In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load the raw genre dataset from disk
df = pd.read_csv('msd_genre_dataset.txt')

# Discard the identifier / free-text columns; only the target and the
# remaining feature columns are kept
df_relevant_features = df.drop(["track_id", "artist_name", "title"], axis=1)

# Feature matrix: everything except the target column
X = df_relevant_features.drop('genre', axis=1)

# Target: genre names mapped to integer codes. The fitted encoder is kept
# around so the codes can be mapped back with `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
label_encoder.fit(df_relevant_features['genre'])
y = label_encoder.transform(df_relevant_features['genre'])


# Upsample data with SMOTE.
# A fixed seed makes the synthetic rows (and everything derived from them)
# reproducible across "Restart Kernel -> Run All".
SMOTE_SEED = 42
smote = SMOTE(random_state=SMOTE_SEED)

sampled_X, sampled_y = smote.fit_resample(X, y)

# Standardize upsampled data.
# NOTE(review): scaling is fitted after SMOTE, so SMOTE itself ran on the raw,
# unscaled features -- confirm this ordering is intended.
standardizer = StandardScaler()
std_X = standardizer.fit_transform(sampled_X)

# Turn y into a column vector so it can be stacked next to std_X
expanded_y = sampled_y[:, np.newaxis]

# Add the sampled y values as column 0 of std_X
std_X = np.concatenate([expanded_y, std_X], axis=1)

# `sweetviz` likes having the whole dataframe for train and test.
# The matrix above is laid out as [genre, features...], so label it exactly
# that way instead of relying on 'genre' being the first column of the CSV.
sampled_X = pd.DataFrame(std_X, columns=['genre', *X.columns])

# Stacking next to the scaled features put genre in the same array as the
# floats; write the encoded labels back so the column holds sampled_y as-is.
sampled_X['genre'] = sampled_y

# Create a new data frame without outliers: a row is kept only if every
# checked feature lies inside [-std_dev_limit, +std_dev_limit] (inclusive).
# The features were standardized above, so the limit is in standard deviations.
std_dev_limit = 5

# Columns the filter applies to, generated rather than hand-copied:
# six scalar features plus avg_timbre1..12 and var_timbre1..12.
outlier_check_columns = (
    ["loudness", "tempo", "time_signature", "key", "mode", "duration"]
    + [f"avg_timbre{i}" for i in range(1, 13)]
    + [f"var_timbre{i}" for i in range(1, 13)]
)

checked_features = sampled_X[outlier_check_columns]

# Element-wise bounds check, then collapse row-wise: True only when all
# checked columns of that row are within the limit.
within_limit = (
    checked_features.ge(-std_dev_limit) & checked_features.le(std_dev_limit)
).all(axis=1)

new_dfs = sampled_X[within_limit]

In [3]:
# Write the outlier-filtered frame to CSV, leaving out the row index
output_csv_path = 'msd_genre_dataset_no_outliers.csv'
new_dfs.to_csv(output_csv_path, index=False)