## Data transformation
Before starting with classification, we transform and normalize the data so that distances between records can be computed.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform

In [3]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall

In [4]:
# Load the training split; the cells below modify this frame in place.
train_csv_path = "dataset (missing + split)/train.csv"
df_train = pd.read_csv(train_csv_path, skipinitialspace=True)

In [5]:
#Rescaling duration_ms from milliseconds to minutes - train
df_train['duration_ms'] = df_train['duration_ms'] * (1/6e4)

#Dividing popularity by 100 (the column then holds a fraction, presumably in [0, 1]) - train
df_train['popularity'] = df_train['popularity'] / 100

#Renaming both columns in one call so that their names reflect the new units
df_train.rename(
    columns={'duration_ms': 'duration_min', 'popularity': 'popularity_percent'},
    inplace=True,
)

### Filling of NaN values

In [6]:
#Dealing with mode attribute missing values - train
#Missing values are filled by sampling 0.0 / 1.0 with the frequencies observed
#in the non-missing rows, so the imputed column keeps the same class balance.

#Fixed seed so that the imputation is reproducible after Restart & Run All
MODE_IMPUTE_SEED = 42

#Computing p0 and p1 over the non-missing rows only
n_known_mode = df_train['mode'].notnull().sum()
mode_counts = df_train['mode'].value_counts()
p0 = mode_counts[0]/n_known_mode
p1 = mode_counts[1]/n_known_mode

list_of_nan_indexes_train = df_train[df_train['mode'].isnull()].index.tolist()
if list_of_nan_indexes_train:
    #One uniform draw per missing row: below p1 -> 1.0, otherwise -> 0.0
    mode_rng = np.random.default_rng(MODE_IMPUTE_SEED)
    uniform_draws = mode_rng.random(len(list_of_nan_indexes_train))
    df_train.loc[list_of_nan_indexes_train, 'mode'] = np.where(uniform_draws < p1, 1.0, 0.0)

In [7]:
#Dealing with time_signature attribute missing values - train
#Missing values are filled by sampling from the empirical distribution of the
#observed (non-missing) time_signature values.
from scipy.stats import multinomial

#Fixed seed so that the imputation is reproducible after Restart & Run All
TIME_SIGNATURE_IMPUTE_SEED = 42

#Observed outcomes sorted by value, with their counts
ts_counts = df_train['time_signature'].value_counts().sort_index(ascending=True)
outcomes_of_time_signature = len(ts_counts)
#Computing the array containing the probabilities of every outcome for time_signature
p_array = np.array(ts_counts/(len(df_train)-df_train['time_signature'].isnull().sum()))
#Dictionary containing the correspondence between sorted position and value.
#It is derived from the data, so it cannot drift out of sync with p_array.
dict_ts = {position: float(value) for position, value in enumerate(ts_counts.index)}

list_of_nan_indexes_ts = df_train[df_train['time_signature'].isnull()].index.tolist()
if list_of_nan_indexes_ts:
    #A single call draws one one-hot multinomial sample per missing row
    one_hot_draws = multinomial.rvs(
        1,
        p_array,
        size=len(list_of_nan_indexes_ts),
        random_state=TIME_SIGNATURE_IMPUTE_SEED,
    )
    #Position of the 1 in each one-hot row = sorted position of the sampled outcome
    sampled_positions = one_hot_draws.argmax(axis=1)
    df_train.loc[list_of_nan_indexes_ts, 'time_signature'] = [
        dict_ts[position] for position in sampled_positions
    ]

Let's check if everything is fine

In [8]:
# Number of missing values left in each column of the training frame
df_train.isnull().sum()

name                         0
duration_min                 0
explicit                     0
popularity_percent           0
artists                      0
album_name                   0
danceability                 0
energy                       0
key                          0
loudness                     0
mode                         0
speechiness                  0
acousticness                 0
instrumentalness             0
liveness                     0
valence                      0
tempo                        0
features_duration_ms         0
time_signature               0
n_beats                      0
n_bars                       0
popularity_confidence    12783
processing                   0
genre                        0
dtype: int64

In [9]:
# Number of training rows per genre label
df_train['genre'].value_counts()

genre
j-dance          750
iranian          750
brazil           750
chicago-house    750
forro            750
idm              750
indian           750
study            750
disney           750
afrobeat         750
mandopop         750
techno           750
sleep            750
spanish          750
j-idol           750
industrial       750
happy            750
bluegrass        750
black-metal      750
breakbeat        750
Name: count, dtype: int64

### Outlier criteria
For now, we do not consider any point an outlier: none of them is missing the classification label, and this time one of our tasks is to distinguish between musical and non-musical genres.

### Eliminating redundant features

In [10]:
# Inspect the dtype of every column of the training frame
df_train.dtypes

name                      object
duration_min             float64
explicit                    bool
popularity_percent       float64
artists                   object
album_name                object
danceability             float64
energy                   float64
key                        int64
loudness                 float64
mode                     float64
speechiness              float64
acousticness             float64
instrumentalness         float64
liveness                 float64
valence                  float64
tempo                    float64
features_duration_ms       int64
time_signature           float64
n_beats                  float64
n_bars                   float64
popularity_confidence    float64
processing               float64
genre                     object
dtype: object

In [11]:
#Columns that add too much complexity or are highly correlated with other attributes
column2drop = [
    'features_duration_ms',
    'popularity_confidence',
    'processing',
    'name',
    'artists',
    'album_name',
    'n_bars',
    'n_beats',
]
df_train.drop(columns=column2drop, inplace=True)

In [12]:
# Preview the first five rows of the training frame after the column drop
df_train.head()

Unnamed: 0,duration_min,explicit,popularity_percent,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,4.029333,False,0.46,0.69,0.513,5,-12.529,1.0,0.0975,0.302,0.536,0.281,0.635,142.221,4.0,j-dance
1,7.4,False,0.0,0.069,0.196,1,-24.801,0.0,0.0805,0.283,0.934,0.36,0.0336,75.596,4.0,iranian
2,5.558433,False,0.03,0.363,0.854,2,-7.445,1.0,0.0397,8.9e-05,0.8,0.262,0.489,116.628,4.0,black-metal
3,4.496667,False,0.23,0.523,0.585,5,-5.607,1.0,0.0765,0.0664,0.00127,0.117,0.385,140.88,4.0,bluegrass
4,5.127517,False,0.25,0.643,0.687,7,-11.34,1.0,0.0539,0.000576,0.0969,0.299,0.637,143.956,4.0,happy


### Encoding `str` values - creating the various `genre` maps
- first map: 20 classes, one for every genre;
- second map: regrouping per macro genre with the following labels:  

| Label | Macro genre | `genre` list |
| ----------- | ----------- | -------------------------------------------------------------|
| 0 | Electronic | `breakbeat`, `chicago-house`, `techno`, `afrobeat`, `idm`, `industrial`|
| 1 | Pop + Asian| `mandopop`,`j-idol`,`happy`,`disney`,`j-dance`, `indian`|
| 2 | Latin | `forro`,`brazil`,`spanish`|
| 3 | Country | `bluegrass`|
| 4 | Metal | `black-metal`|
| 5 | Iranian | `iranian`|
| 6 | Others | `sleep`, `study`|

In [13]:
#Genre map I -> every genre gets its own int code (its position in this list),
#so that genre can take part in correlation computations.
_genres_in_code_order = [
    "j-dance", "iranian", "brazil", "chicago-house", "forro",
    "idm", "indian", "study", "disney", "afrobeat",
    "mandopop", "techno", "sleep", "spanish", "j-idol",
    "industrial", "happy", "bluegrass", "black-metal", "breakbeat",
]
genre_map = {genre: code for code, genre in enumerate(_genres_in_code_order)}

#Genre map II -> every genre gets the label of its macro-category:
#0 Electronic, 1 Pop + Asian, 2 Latin, 3 Country, 4 Metal, 5 Iranian, 6 Others
_macrogenre_members = {
    0: ["chicago-house", "idm", "afrobeat", "techno", "industrial", "breakbeat"],
    1: ["j-dance", "indian", "disney", "mandopop", "j-idol", "happy"],
    2: ["brazil", "forro", "spanish"],
    3: ["bluegrass"],
    4: ["black-metal"],
    5: ["iranian"],
    6: ["study", "sleep"],
}
_macro_label_of = {
    genre: label
    for label, members in _macrogenre_members.items()
    for genre in members
}
#Built by walking genre_map so that both maps list the genres in the same order
macrogenre_map = {genre: _macro_label_of[genre] for genre in genre_map}

#aa

#### Encoding attributes for pattern mining 

Encoding continuous variables using quartiles (four equal-frequency bins per attribute).

In [15]:
#Replaces every continuous attribute with a "<name>_bin" column holding its quartile interval
continuous_attributes = [
    "duration_min", "popularity_percent", "danceability", "energy",
    "loudness", "speechiness", "acousticness", "instrumentalness",
    "liveness", "valence", "tempo",
]

#instrumentalness is the only attribute binned with duplicates='drop', i.e. coinciding
#quartile edges are merged (its output shows three bins instead of four)
qcut_extra_options = {"instrumentalness": {"duplicates": "drop"}}

for attribute in continuous_attributes:
    df_train[f"{attribute}_bin"] = pd.qcut(
        df_train[attribute], 4, **qcut_extra_options.get(attribute, {})
    )

#The raw continuous columns are no longer needed once their binned versions exist
df_train.drop(columns=continuous_attributes, inplace=True)


In [16]:
# Preview the first five rows now that the continuous attributes are replaced by their bins
df_train.head()

Unnamed: 0,explicit,key,mode,time_signature,genre,duration_min_bin,popularity_percent_bin,danceability_bin,energy_bin,loudness_bin,speechiness_bin,acousticness_bin,instrumentalness_bin,liveness_bin,valence_bin,tempo_bin
0,False,5,1.0,4.0,j-dance,"(3.797, 4.815]","(0.42, 0.94]","(0.58, 0.695]","(0.48, 0.709]","(-49.532, -10.636]","(0.0886, 0.939]","(0.155, 0.573]","(0.00313, 0.744]","(0.28, 0.994]","(0.416, 0.664]","(141.986, 220.525]"
1,False,1,0.0,4.0,iranian,"(4.815, 68.671]","(-0.001, 0.14]","(-0.001, 0.441]","(-0.001, 0.48]","(-49.532, -10.636]","(0.051, 0.0886]","(0.155, 0.573]","(0.744, 1.0]","(0.28, 0.994]","(-0.001, 0.196]","(-0.001, 99.939]"
2,False,2,1.0,4.0,black-metal,"(4.815, 68.671]","(-0.001, 0.14]","(-0.001, 0.441]","(0.709, 0.884]","(-10.636, -7.303]","(0.0373, 0.051]","(-0.001, 0.00974]","(0.744, 1.0]","(0.131, 0.28]","(0.416, 0.664]","(99.939, 124.188]"
3,False,5,1.0,4.0,bluegrass,"(3.797, 4.815]","(0.14, 0.24]","(0.441, 0.58]","(0.48, 0.709]","(-7.303, -5.101]","(0.051, 0.0886]","(0.00974, 0.155]","(-0.001, 0.00313]","(0.0979, 0.131]","(0.196, 0.416]","(124.188, 141.986]"
4,False,7,1.0,4.0,happy,"(4.815, 68.671]","(0.24, 0.42]","(0.58, 0.695]","(0.48, 0.709]","(-49.532, -10.636]","(0.051, 0.0886]","(-0.001, 0.00974]","(0.00313, 0.744]","(0.28, 0.994]","(0.416, 0.664]","(141.986, 220.525]"


Next, we need to label-encode the nominal categorical attributes.