# Feature Engineering

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris



import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.compose import ColumnTransformer

In [41]:
# Location of the prepared Spotify dataset (relative to the notebook's working directory).
SPOTIFY_CSV = 'ML_Project/spotify_df.csv'

# Load the dataset, then show the first rows as the cell output.
spotify_df = pd.read_csv(SPOTIFY_CSV)
spotify_df.head()

Unnamed: 0,genre,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duration_sn
0,Movie,0,0.611,0.389,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814,99.373
1,Movie,1,0.246,0.59,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816,137.373
2,Movie,3,0.952,0.663,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368,170.267
3,Movie,0,0.703,0.24,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227,152.427
4,Movie,4,0.95,0.331,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39,82.625


In [42]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232641 entries, 0 to 232640
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232641 non-null  object 
 1   popularity        232641 non-null  int64  
 2   acousticness      232641 non-null  float64
 3   danceability      232641 non-null  float64
 4   energy            232641 non-null  float64
 5   instrumentalness  232641 non-null  float64
 6   key               232641 non-null  object 
 7   liveness          232641 non-null  float64
 8   loudness          232641 non-null  float64
 9   mode              232641 non-null  object 
 10  speechiness       232641 non-null  float64
 11  tempo             232641 non-null  float64
 12  time_signature    232641 non-null  object 
 13  valence           232641 non-null  float64
 14  duration_sn       232641 non-null  float64
dtypes: float64(10), int64(1), object(4)
memory usage: 26.6+ MB


# Encoding Categorical Columns

In [43]:
# Binary encode 'mode': Minor -> 0, Major -> 1.
MODE_CODES = {'Minor': 0, 'Major': 1}

# Series.map() turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would silently wipe it.
# Only encode while the column does not yet consist purely of the numeric codes.
if not spotify_df['mode'].isin(list(MODE_CODES.values())).all():
    spotify_df['mode'] = spotify_df['mode'].map(MODE_CODES)

# Fail loudly here instead of passing NaNs on to the preprocessing steps.
assert spotify_df['mode'].notna().all(), "unexpected label in 'mode' column"


In [44]:
# Preview the first rows to confirm that 'mode' now holds the numeric codes.
spotify_df.head()

Unnamed: 0,genre,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duration_sn
0,Movie,0,0.611,0.389,0.91,0.0,C#,0.346,-1.828,1,0.0525,166.969,4/4,0.814,99.373
1,Movie,1,0.246,0.59,0.737,0.0,F#,0.151,-5.559,0,0.0868,174.003,4/4,0.816,137.373
2,Movie,3,0.952,0.663,0.131,0.0,C,0.103,-13.879,0,0.0362,99.488,5/4,0.368,170.267
3,Movie,0,0.703,0.24,0.326,0.0,C#,0.0985,-12.178,1,0.0395,171.758,4/4,0.227,152.427
4,Movie,4,0.95,0.331,0.225,0.123,F,0.202,-21.15,1,0.0456,140.576,4/4,0.39,82.625


In [45]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [46]:
# Target: the popularity score we want to predict.
y = spotify_df['popularity']

# Features: every remaining column (drop returns a new frame; spotify_df is untouched).
X = spotify_df.drop(columns='popularity')

## Applying Different Scalers

In [47]:
# Columns that get one-hot encoded vs. columns that get scaled.
categorical_features = ['genre', 'key', 'time_signature']
numeric_features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_sn',
]

# One-hot encode the categoricals (drop='first' avoids the dummy-variable trap)
# and standardise the numeric columns.
std_steps = [
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
    ('num', StandardScaler(), numeric_features),
]

# remainder='passthrough' keeps the one unlisted column ('mode') as it is.
preprocessor = ColumnTransformer(transformers=std_steps, remainder='passthrough')

# Fit the transformer on X and transform X in a single step.
X_std = preprocessor.fit_transform(X)

# Names of the transformed columns, kept so feature importances can be labelled later.
column_names = preprocessor.get_feature_names_out()



In [48]:
from pickle import dump

# Persist the transformed feature names for later use.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# a bare open() passed straight into dump() leaves the handle open.
with open('column_names.pkl', 'wb') as column_names_file:
    dump(column_names, column_names_file)

In [49]:
# Columns that get one-hot encoded vs. columns that get scaled.
categorical_features = ['genre', 'key', 'time_signature']
numeric_features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_sn',
]

# One-hot encode the categoricals (drop='first' avoids the dummy-variable trap)
# and min-max scale the numeric columns.
minmax_steps = [
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
    ('num', MinMaxScaler(), numeric_features),
]

# remainder='passthrough' keeps the one unlisted column ('mode') as it is.
preprocessor = ColumnTransformer(transformers=minmax_steps, remainder='passthrough')

# Fit the transformer on X and transform X in a single step.
X_minmax = preprocessor.fit_transform(X)

In [50]:
# RobustScaler is only needed from this cell onwards.
from sklearn.preprocessing import RobustScaler

# Columns that get one-hot encoded vs. columns that get scaled.
categorical_features = ['genre', 'key', 'time_signature']
numeric_features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_sn',
]

# One-hot encode the categoricals (drop='first' avoids the dummy-variable trap)
# and scale the numeric columns with RobustScaler.
robust_steps = [
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
    ('num', RobustScaler(), numeric_features),
]

# remainder='passthrough' keeps the one unlisted column ('mode') as it is.
preprocessor = ColumnTransformer(transformers=robust_steps, remainder='passthrough')

# Fit the transformer on X and transform X in a single step.
X_robust = preprocessor.fit_transform(X)


# Train Test Split

In [51]:
# Train test split
#
# Split the *raw* features first and fit every preprocessor on the training rows
# only. Fitting a scaler on the full X before splitting lets statistics of the
# test rows (mean/std, min/max, quantiles) leak into the training features.
# The row partition depends only on the number of rows and random_state, so it is
# the same partition the pre-scaled arrays would have been split into.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fix the one-hot vocabulary from the whole frame so the encoded column layout
# does not depend on which rows land in the training split (a category that only
# occurs in the test rows would otherwise be unknown to the encoder).
onehot_categories = [sorted(X[col].unique()) for col in categorical_features]


def _scaled_split(name, scaler):
    """Build one dataset entry whose numeric columns are scaled with ``scaler``.

    Parameters
    ----------
    name : str
        Label stored under the ``'name'`` key.
    scaler : sklearn transformer
        Unfitted scaler applied to ``numeric_features``.

    Returns
    -------
    dict
        Keys ``'name'``, ``'X_train'``, ``'X_test'``, ``'y_train'``, ``'y_test'``.
        The preprocessor is fitted on the training rows and only applied to the
        test rows.
    """
    split_preprocessor = ColumnTransformer(
        transformers=[
            ('cat',
             OneHotEncoder(categories=onehot_categories, drop='first', sparse_output=False),
             categorical_features),  # drop='first' avoids dummy trap
            ('num', scaler, numeric_features),
        ],
        remainder='passthrough',  # Keep 'mode' as is
    )
    return {
        'name': name,
        'X_train': split_preprocessor.fit_transform(X_train_raw),
        'X_test': split_preprocessor.transform(X_test_raw),
        'y_train': y_train,
        'y_test': y_test,
    }


# Same order as before: robust first, then minmax, then standard.
datasets = [
    _scaled_split('with robust', RobustScaler()),
    _scaled_split('with minmax', MinMaxScaler()),
    _scaled_split('with standart', StandardScaler()),
]

# Leave X_train / X_test bound to the robust-scaled split, as they were before.
X_train, X_test = datasets[0]['X_train'], datasets[0]['X_test']

In [52]:
# Save datasets to a pickle file (you can save any data with this library) for later use

from pickle import dump

# The context manager guarantees the file is flushed and closed even if dump() raises;
# a bare open() passed straight into dump() leaves the handle open.
with open('datasets.pkl', 'wb') as datasets_file:
    dump(datasets, datasets_file)