Load the data.

Report: https://docs.google.com/document/d/1y1XSL5tRy91a2g2LWPyvDZDbgZ1tkxSpI64DL-aLQsg/edit?usp=sharing

# Processing the Data

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


df = []
names = ["alternative.csv", "blues.csv", "childrens music.csv", 
         "comedy.csv", "electronic.csv", "folk.csv", "hip-hop.csv", 
         "movie.csv", "ska.csv", "soul.csv"]

for i in range(len(names)):
    df.append( pd.read_csv("training-data/" + names[i], header = 0) )
    
df = pd.concat(df, ignore_index=True)

# export to csv bc i'm lazy and don't want to load every single one.
df.to_csv('training-data/all.csv', sep = ',', index = False)
columnsToDrop = ['artist_name', 'track_name', 'track_id', 'time_signature']
columnsToKeep = ['instance_id', 'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'key', 'mode']
columnsScaled = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'] 

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# drop columns
df = df.drop(columnsToDrop, axis=1)

# interpolate tempo

from sklearn.impute import SimpleImputer
imp_temp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_dura = SimpleImputer(missing_values=np.nan, strategy='mean')

df['tempo'] = df['tempo'].replace('?',  np.nan)
df['tempo'] = imp_temp.fit_transform(df[['tempo']])


# interpolate duration
df['duration_ms'] = df['duration_ms'].replace('-1',  np.nan)
df['duration_ms'] = imp_dura.fit_transform(df[['duration_ms']])
    
# turn key into key with mode + encode
df['key'] = df['key'] + df['mode']
# keyEncoder = preprocessing.LabelEncoder()
# df['key'] = keyEncoder.fit_transform(df['key'])

keyEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = keyEncoder.fit_transform(df[['key']])
encodedB = pd.DataFrame(encoded.toarray(), columns=keyEncoder.categories_)
df = df.merge(encodedB, left_index = True, right_index=True)
display(df)

# encoce major/minor into binary
modeEncoder = preprocessing.LabelEncoder()
df['mode'] = modeEncoder.fit_transform(df['mode'])

# also label encode
# encoce major/minor into binary
genreEncoder = preprocessing.LabelEncoder()
df['genre_label'] = genreEncoder.fit_transform(df['genre'])

# scale
scalers = []

for col in columnsScaled: 
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(df[[col]])
    df[[col]] = x_scaled
    scalers.append(scaler)
    

df.info()
display(df)

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,...,"(EMajor,)","(EMinor,)","(F#Major,)","(F#Minor,)","(FMajor,)","(FMinor,)","(G#Major,)","(G#Minor,)","(GMajor,)","(GMinor,)"
0,50010,43,0.225000,0.845,238680.0,0.746,0.000088,FMinor,0.0785,-5.655,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,50011,47,0.665000,0.862,166154.0,0.342,0.000082,A#Minor,0.1020,-10.095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50012,48,0.238000,0.590,219400.0,0.517,0.000000,AMajor,0.1820,-9.239,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,50013,60,0.000077,0.374,-1.0,0.971,0.000000,BMinor,0.3220,-4.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50014,58,0.002730,0.449,-1.0,0.971,0.000000,A#Major,0.1410,-3.660,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,172995,41,0.113000,0.857,238627.0,0.669,0.016800,CMajor,0.1100,-5.722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,172996,48,0.381000,0.665,243794.0,0.369,0.914000,CMajor,0.1080,-16.111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,172997,41,0.071100,0.796,252347.0,0.466,0.000000,CMinor,0.2560,-9.386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,172998,42,0.727000,0.283,377907.0,0.421,0.000095,DMajor,0.3300,-8.112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  int64  
 1   popularity        50000 non-null  float64
 2   acousticness      50000 non-null  float64
 3   danceability      50000 non-null  float64
 4   duration_ms       50000 non-null  float64
 5   energy            50000 non-null  float64
 6   instrumentalness  50000 non-null  float64
 7   key               50000 non-null  object 
 8   liveness          50000 non-null  float64
 9   loudness          50000 non-null  float64
 10  mode              50000 non-null  int32  
 11  speechiness       50000 non-null  float64
 12  tempo             50000 non-null  float64
 13  valence           50000 non-null  float64
 14  genre             50000 non-null  object 
 15  (A#Major,)        50000 non-null  float64
 16  (A#Minor,)        50000 non-null  float6

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,...,"(EMinor,)","(F#Major,)","(F#Minor,)","(FMajor,)","(FMinor,)","(G#Major,)","(G#Minor,)","(GMajor,)","(GMinor,)",genre_label
0,50010,0.292171,-0.367420,1.709313,0.391233,0.525115,-0.402381,FMinor,-0.747419,0.700962,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,50011,0.511881,0.916117,1.815951,-0.106841,-1.142089,-0.402407,A#Minor,-0.649017,-0.308977,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,50012,0.566809,-0.329498,0.109738,0.258827,-0.419909,-0.402760,AMajor,-0.314031,-0.114268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,50013,1.225940,-1.023550,-1.245197,-1.247914,1.453633,-0.402760,BMinor,0.272194,1.012814,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,50014,1.116084,-1.015811,-0.774733,-1.247914,1.453633,-0.402760,A#Major,-0.485711,1.154752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,172995,0.182316,-0.694139,1.784587,0.390869,0.207356,-0.330298,CMajor,-0.615518,0.685722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
49996,172996,0.566809,0.087652,0.580201,0.426354,-1.030667,3.539531,CMajor,-0.623893,-1.677398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
49997,172997,0.182316,-0.816367,1.401943,0.485091,-0.630373,-0.402760,CMinor,-0.004169,-0.147705,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
49998,172998,0.237243,1.096979,-1.816025,1.347377,-0.816076,-0.402353,DMajor,0.305693,0.142084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [8]:
# remove columns used for understanding data

NRCOMPO = 15

df.info()

droppable = ['instance_id', 'genre', 'key']

df = df.drop(droppable, axis=1)
df.info()

df.to_csv('training-processed-base.csv', sep = ',', index = False)

gen_train = df.copy()

# do dimensionality reduction
# from tutorial

from sklearn.decomposition import PCA

dfCopy = df.copy()
inform = dfCopy.loc[:, dfCopy. columns != 'genre_label']
answer1 = dfCopy['genre_label']

pca = PCA(n_components=NRCOMPO) 
pca.fit(inform)
pca_train = pd.DataFrame(pca.transform(inform))

pca_train.info()

pca_train.to_csv('training-processed-pca.csv', sep = ',', index = False)

# do dimensionality reduction
# from tutorial

from sklearn.decomposition import FastICA

dfCopy = df.copy()
inform = dfCopy.loc[:, dfCopy. columns != 'genre_label']
answer2 = dfCopy['genre_label']

ica = FastICA(n_components=NRCOMPO) 
ica.fit(inform)
ica_train = pd.DataFrame(pca.transform(inform))

ica_train.info()

ica_train.to_csv('training-processed-ica.csv', sep = ',', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  int64  
 1   popularity        50000 non-null  float64
 2   acousticness      50000 non-null  float64
 3   danceability      50000 non-null  float64
 4   duration_ms       50000 non-null  float64
 5   energy            50000 non-null  float64
 6   instrumentalness  50000 non-null  float64
 7   key               50000 non-null  object 
 8   liveness          50000 non-null  float64
 9   loudness          50000 non-null  float64
 10  mode              50000 non-null  int32  
 11  speechiness       50000 non-null  float64
 12  tempo             50000 non-null  float64
 13  valence           50000 non-null  float64
 14  genre             50000 non-null  object 
 15  (A#Major,)        50000 non-null  float64
 16  (A#Minor,)        50000 non-null  float6



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       50000 non-null  float64
 1   1       50000 non-null  float64
 2   2       50000 non-null  float64
 3   3       50000 non-null  float64
 4   4       50000 non-null  float64
 5   5       50000 non-null  float64
 6   6       50000 non-null  float64
 7   7       50000 non-null  float64
 8   8       50000 non-null  float64
 9   9       50000 non-null  float64
 10  10      50000 non-null  float64
 11  11      50000 non-null  float64
 12  12      50000 non-null  float64
 13  13      50000 non-null  float64
 14  14      50000 non-null  float64
dtypes: float64(15)
memory usage: 5.7 MB




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       50000 non-null  float64
 1   1       50000 non-null  float64
 2   2       50000 non-null  float64
 3   3       50000 non-null  float64
 4   4       50000 non-null  float64
 5   5       50000 non-null  float64
 6   6       50000 non-null  float64
 7   7       50000 non-null  float64
 8   8       50000 non-null  float64
 9   9       50000 non-null  float64
 10  10      50000 non-null  float64
 11  11      50000 non-null  float64
 12  12      50000 non-null  float64
 13  13      50000 non-null  float64
 14  14      50000 non-null  float64
dtypes: float64(15)
memory usage: 5.7 MB


In [9]:
# process test data

df = pd.read_csv("test-data/test.csv", header = 0)

# drop columns
df = df.drop(columnsToDrop, axis=1)

# turn key into key with mode + encode
df['key'] = df['key'] + df['mode']
encoded = keyEncoder.transform(df[['key']])
encodedB = pd.DataFrame(encoded.toarray(), columns=keyEncoder.categories_)
df = df.merge(encodedB, left_index = True, right_index=True)

# encoce major/minor into binary
df['mode'] = modeEncoder.transform(df['mode'])

# interpolate duration + tempo

df['tempo'] = df['tempo'].replace('?',  np.nan)
df['tempo'] = imp_temp.transform(df[['tempo']])
df['duration_ms'] = df['duration_ms'].replace('-1',  np.nan)
df['duration_ms'] = imp_dura.transform(df[['duration_ms']])

# scale

for i in range(len(columnsScaled) ): 
    col = columnsScaled[i]
    scaler = scalers[i]
    x_scaled = scaler.transform(df[[col]])
    df[[col]] = x_scaled

# remove columns used for understanding data + save base
droppable = ['instance_id', 'key']
instanceIds = df['instance_id']
df = df.drop(droppable, axis=1)
df.to_csv('test-processed-base.csv', sep = ',', index = False)

display(df.info())
# do dimensionality reduction
# from tutorial

display(df)

gen_test = pd.DataFrame(df.copy())

dfCopy = df.copy()
pca_test = pd.DataFrame(pca.transform(dfCopy))
pca_test.to_csv('test-processed-pca.csv', sep = ',', index = False)

# do dimensionality reduction
# from tutorial

dfCopy = df.copy()
ica_test = pd.DataFrame(ica.transform(dfCopy)) 
ica_test.to_csv('test-processed-pca.csv', sep = ',', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30931 entries, 0 to 30930
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        30931 non-null  float64
 1   acousticness      30931 non-null  float64
 2   danceability      30931 non-null  float64
 3   duration_ms       30931 non-null  float64
 4   energy            30931 non-null  float64
 5   instrumentalness  30931 non-null  float64
 6   liveness          30931 non-null  float64
 7   loudness          30931 non-null  float64
 8   mode              30931 non-null  int32  
 9   speechiness       30931 non-null  float64
 10  tempo             30931 non-null  float64
 11  valence           30931 non-null  float64
 12  (A#Major,)        30931 non-null  float64
 13  (A#Minor,)        30931 non-null  float64
 14  (AMajor,)         30931 non-null  float64
 15  (AMinor,)         30931 non-null  float64
 16  (BMajor,)         30931 non-null  float6

None

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,...,"(EMajor,)","(EMinor,)","(F#Major,)","(F#Minor,)","(FMajor,)","(FMinor,)","(G#Major,)","(G#Minor,)","(GMajor,)","(GMinor,)"
0,0.182316,-0.279906,0.793478,0.254247,1.362845,0.701426,-0.664510,1.052393,1,0.835730,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.017533,0.038061,0.479835,-0.155511,-1.331919,-0.402727,-0.523397,-0.148387,0,-0.539209,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.951302,0.274348,-0.724551,0.364601,-0.799569,-0.402760,-0.777149,-0.401100,1,0.271216,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.685222,-0.055287,1.151030,-0.707021,-0.312614,-0.402760,0.318255,0.741450,0,-0.560475,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.896374,-1.022074,0.141102,0.286483,1.135874,-0.402755,0.288944,1.051256,0,-0.543462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30926,-2.069715,0.046812,0.228922,0.165704,-0.498317,-0.402760,-0.759562,0.350213,0,-0.457238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
30927,1.171012,-1.023624,-1.226378,0.478128,1.313324,2.090286,-0.793061,0.873152,0,-0.417413,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30928,0.292171,1.554968,0.191285,-1.247914,-1.125582,3.548157,-0.640642,-0.624241,0,-0.551195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
30929,1.061157,0.892780,-2.123395,1.717949,-0.448796,0.861016,1.205968,-1.060289,1,-0.475798,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0




# ML

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.model_selection import train_test_split

seedInt = 42

trainingSets = []
testSets = []

sets = [gen_train.loc[:, gen_train. columns != 'genre_label'], pca_train, ica_train]
answers = [gen_train['genre_label'], answer1, answer2]

for i in range(len(sets)):
    answer = answers[i]
    theSet = sets[i]
    X_train, X_test, y_train, y_test = train_test_split(theSet, 
                                                        answer ,
                                                        test_size=0.33, random_state=seedInt)
    trainingSets.append([X_train, y_train])   
    testSets.append([X_train, y_train])      
    
loopIter=  500
                                                                                                 
mlModels = [[tree.DecisionTreeClassifier(random_state=seedInt), 
             
             RandomForestClassifier(random_state=seedInt), 
            MLPClassifier(random_state=seedInt, max_iter=loopIter)]
            , [tree.DecisionTreeClassifier(random_state=seedInt), 
               
               RandomForestClassifier(random_state=seedInt), 
            MLPClassifier(random_state=seedInt, max_iter=loopIter)]
            , [tree.DecisionTreeClassifier(random_state=seedInt), 
             
               RandomForestClassifier(random_state=seedInt), 
            MLPClassifier(random_state=seedInt, max_iter=loopIter)]]
modelName = ["DecisionTree",  
             "RandomForest", "MLPClassifiers"]

realTestSets = [gen_test, pca_test, ica_test]
realAnswerSets = [[], [], [], []]

scores = []


for i in range(len(trainingSets)): 
    print("Set Nr", i)
    trainingSet = trainingSets[i]
    testSet = testSets[i]
    
    answerSet = realAnswerSets[i] # save set here
    
    scoreList = []
    scores.append(scoreList)
    
    for j in range( len(mlModels[i])):
        model = mlModels[i][j]
        model.fit(trainingSet[0], trainingSet[1])
        score = model.score(testSet[0], testSet[1])
        print(modelName[j], score)
        scoreList.append(score)
        answerSet.append(model.predict(realTestSets[i]))        
        


Set Nr 0




DecisionTree 0.9957910447761193




RandomForest 0.9957611940298507




MLPClassifiers 0.6625074626865671
Set Nr 1
DecisionTree 0.9957910447761193
RandomForest 0.9957910447761193
MLPClassifiers 0.6452537313432836
Set Nr 2
DecisionTree 0.9957910447761193
RandomForest 0.9957910447761193
MLPClassifiers 0.6452537313432836


In [11]:
# find most common answer for EACH MODEL 
import csv

def getValuesAt(answers, row, scores):
    returnVal = []
    for iSet in range(len(answers)):
        for iModel in range(len(answers[iSet])):
            answer = answers[iSet][iModel][row]
            score = scores[iSet][iModel]
            
            returnVal.append([answer, score])
    
    return returnVal
            
def getBestValues(values): 
    counts = [0] * len(genreEncoder.classes_)
    
    for value in values: # count values
        index = value[0]
        counts[index] = counts[index] + 1
            
        
    # find best
    bestValue = []
    bestCount = 0
    
    for i in range(len(counts)):     # go through all genres
        count = counts[i]
        if(count > bestCount): # if count of current genre is better, set that
            bestValue = []
            bestValue.append(i)
            bestCount = count 
        elif (count == bestCount): # add if there's an equal situatiion
            bestValue.append(i)
    
    if(len(bestValue) != 1): # if there are more than 2 "winners"
        bestScores = [0] * len(bestValue)

        for entry in values:
            theValue = entry[0]
            theScore = entry[1]
            if(theValue in bestValue): # is this one of the best values
                if ( bestScores[bestValue.index(theValue)] < entry[1]):  # is the score highest
                     bestScores[bestValue.index(theValue)] = entry[1]

        maxScoreIndex = bestScores.index(max(bestScores))

        return maxScoreIndex
    
    return bestValue[0]


answers = [['instance_id', 'genre']] # [set][rows of instances]

for row in range(len(instanceIds)): # for each find the best
    possibleValues = getValuesAt(realAnswerSets, row, scores)
    bestValues = getBestValues(possibleValues)

    theGenre = genreEncoder.inverse_transform([bestValues])[0]


    answers.append([instanceIds[row], 
                    theGenre
                   ])


with open("my_answer4.csv","w+") as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(answers)




# Analysis Notes

The data contains 17 features, the id and the genre. Of those, every feature has every piece of information (meaning no missing data), there are 8 categorical features and 11 numerical or continuous features. There are 5000 instances in the data set. However, further processing will show that there *are* missing features: for example, the duration is set to -1 for some, which is impossible and thus missing data.

instance_id is a unique id for each instance. It needs to be kept for the Kaggle competition.

artist_name has an option for empty_field. A Google search shows no artist with that name; that's missing data. There is an unlimited number of artists possible, so there's no way to interpolate that correctly, as it may be someone not in the data set. While it may help to connect an artist to what their primary genre is, we do not have the data on every single artist in the world, meaning it's impossible to use the data for that. It'll be dropped.

track_name does not appear to have missing values. That being said, it'd be very hard to encode, as almost every title track is unique. This may provide a hint on what the song is about (love, ...), which may contribute to the algorithm, but for now, I will be dropping this column for a "minimum viable product" type of report; if I have time, I'll add it later.

The track ID appears to be another unique identifier for the instance, potentially linking it to an mp3 file, or similar. We were not given that, and thus, I decided to drop it.

Popularity will be standardised, as will acoutstiness, danceability, duration_ms, energy, instrumentalness, liveness, loudness, speechiness, tempo, valence, as all are numerical values that may contribute to finding the genre.

Mode and key are a special situation: the data set actually does a mistake, as C Minor and C Major are not the same key (quoting Wikipedia: "The key may be in the major or minor mode, though musicians assume major when this is not specified, e.g., "This piece is in C" implies that the key of the song is C major. Popular songs are usually in a key, and so is classical music during the common practice period, around 1650–1900. Longer pieces in the classical repertoire may have sections in contrasting keys." (https://en.wikipedia.org/wiki/Key_(music) on 10/09/2022) and my Piano teacher), and thus, I intend to merge them into one column, and then  encode them. However, major/minor does introduce a general mood to the piece of music (happy versus mellow, for example), and I will ALSO keep it as its own column. However, sicne C Minor and C Major are different keys, I'll keep them sepperate. 

Tempo signature needs to be changed. It's currently a date, when in fact, the correct display would be 4/4, ... Now, 4/4 and 2/2 may both equal 1, but musically, they're not the same, as 4/4 puts more emphasis on the 3rd beat, which 2/2 doesn't do. This needs to be encoded; whenever categorical or ordinal. On the other hand, it can be argued that the difference is minimal, and thus, just using a float of the division may work. Ultimatively, due to time constraints, I decided to drop the feature, and only return to it if I had time.

In [12]:
# # one hot encode genre
# # i am doing this for correlation
# dummies = pd.get_dummies(df.genre)
# df = pd.concat([df, dummies], axis=1)

# # heat map showing correlation between multiple features (ignoring the one hot encoded features)
# plt.figure(figsize=(10,10))
# smaller = df[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'mode','speechiness', 'tempo', 'valence', 'genre_label']]
# cor = smaller.corr(method='pearson')
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()

# # linking each numeric features w/ the genres

# numericFeatures = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'mode','speechiness', 'tempo', 'valence']

# # heat map showing correlation between multiple features (ignoring the one hot encoded features)
# for f in numericFeatures:
#     plt.figure(figsize=(10,5))
#     features = ['Alternative', 'Blues', "Children's Music", 'Comedy', 'Electronic', 'Folk', 'Hip-Hop', 'Movie', 'Ska', 'Soul', 'genre_label']
#     features.append(f)
#     smaller = df[features]
#     cor = smaller.corr(method='pearson')
#     sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
#     plt.show()
    
#     df.plot.scatter(x = 'genre', y = f, s = 100)
    
#     plt.figure(figsize=(10,5))
#     fig, axes = plt.subplots()
#     sns.violinplot('genre',f, data=df, ax = axes)
#     axes.set_title(f)
#     plt.show()


a**Please note that the code will crash in the above code because it was used during data analysis, the code changed and is just kept for record purposes.**

This heat map shows especially strong correlation between energy and loudness (to be expected; loud music is usually considered energetic) and high speechiness and liveness (also to be espected). Comparing the difference in other correlations between energy and loudness shows similar results, but in my opinion, they are different enough (especially in liveness, instrumentalness, and accoutsticness, as well as popularity) that they do not warrant one of them being removed. The same applies to speechiness and liveness. 

These heat maps show if one specific genre is especially correlated to a numeric feature. For example, it shows that the most correlatated genre of popularity is Hip-Hop at 0.38, etc.

Links between Features and Genres:

Popularity: Hip-Hop
Acousticness: Comedy, followed closely by Movie
Danceability: Hip-Hop
Duration_MS: All equally low.
Energy: Ska, followed by Alternative
Instrumentalness: Electronic
Liveness: Comedy
Loudness: Ska, Alternative
Mode:  Electronic
Speechiness: Comedy
Tempo: all very low
Valence: Ska