### Preprocessing Data

Scikit-learn estimators do not accept non-numeric (categorical) features by default, so such features must be converted into numeric values before a model can be fitted.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the cleaned music dataset from disk and preview its first five rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like leftover saved indexes -- confirm whether they belong in the data.
music = pd.read_csv(filepath_or_buffer='../Data/music_clean.csv')
music.head()

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,36506,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,37591,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,37658,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,36060,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,35710,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


In [None]:
'''

# This cell is intentionally disabled (wrapped in a string): the genre column of
# the dataset above originally contained actual category labels, but it is
# already numeric in this cleaned copy.
# Regardless, here is how one-hot encoding can be performed using pandas.

# Suppose genre contains values such as "Jazz", "Blues", "Rock", etc.
# pd.get_dummies() builds one indicator column per category.

genre_dummies = pd.get_dummies(music['genre'], drop_first=True)

# Note that we specify drop_first=True. This drops the first category column.
# This is done when using the data to train models such as linear regression,
# which work on matrices. Dropping the first column eliminates the linear
# dependence/collinearity among the category columns, which would otherwise
# make the matrices that linear regression works on non-invertible.

# Then, concatenate the encoded variables with the original data and drop the
# original genre column since it is no longer needed.
# DataFrame.drop returns a new frame rather than modifying the original, so
# its result has to be assigned back; otherwise genre would stay in the frame.

music_dummies = pd.concat([music, genre_dummies], axis=1)
music_dummies = music_dummies.drop(columns='genre')

'''

### Missing Data

In [None]:
'''

# This cell is intentionally disabled (wrapped in a string).
# Here's how missing data would be imputed using sklearn's SimpleImputer.

from sklearn.impute import SimpleImputer
# train_test_split is used below and is not imported anywhere else in the notebook.
from sklearn.model_selection import train_test_split

# Split the categorical and numeric variables first.
# NOTE(review): the preview of music shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns; they are still part of X_num here, exactly as before -- confirm
# whether these saved-index columns should really be used as features.

X_cat = music['genre'].values.reshape(-1, 1)
X_num = music.drop(['genre', 'popularity'], axis=1).values
y = music['popularity'].values

# Split all three arrays in one call so that the categorical features, the
# numeric features and the target are guaranteed to contain the same rows.
# (Same test_size and random_state as before, so the split itself is unchanged.)

(X_train_cat, X_test_cat,
 X_train_num, X_test_num,
 y_train, y_test) = train_test_split(X_cat, X_num, y, test_size=0.2, random_state=12)

# Fit each imputer on the training split only, then apply it to the test split,
# so that no information from the test data leaks into the imputed values.

imp_cat = SimpleImputer(strategy='most_frequent')
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

# By default, SimpleImputer() uses strategy='mean'.

imp_num = SimpleImputer()
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Recombine the features column-wise: numeric columns first, then the
# categorical column.

X_train = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test = np.concatenate([X_test_num, X_test_cat], axis=1)

'''