Load the data.

Report: https://docs.google.com/document/d/1y1XSL5tRy91a2g2LWPyvDZDbgZ1tkxSpI64DL-aLQsg/edit?usp=sharing

# Processing the Data

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


df = []
names = ["alternative.csv", "blues.csv", "childrens music.csv", 
         "comedy.csv", "electronic.csv", "folk.csv", "hip-hop.csv", 
         "movie.csv", "ska.csv", "soul.csv"]

for i in range(len(names)):
    df.append( pd.read_csv("training-data/" + names[i], header = 0) )
    
df = pd.concat(df, ignore_index=True)

# export to csv bc i'm lazy and don't want to load every single one.
df.to_csv('training-data/all.csv', sep = ',', index = False)
columnsToDrop = ['artist_name', 'track_name', 'track_id', 'time_signature']
columnsToKeep = ['instance_id', 'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'key', 'mode']
columnsScaled = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'] 

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# drop columns
df = df.drop(columnsToDrop, axis=1)

# interpolate tempo
df['tempo'] = df['tempo'].replace('?',  np.nan)
df['tempo'] = df['tempo'].interpolate()
    
# turn key into key with mode + encode
df['key'] = df['key'] + df['mode']
# keyEncoder = preprocessing.LabelEncoder()
# df['key'] = keyEncoder.fit_transform(df['key'])

keyEncoder = OneHotEncoder(handle_unknown='ignore')
encoded = keyEncoder.fit_transform(df[['key']])
encodedB = pd.DataFrame(encoded.toarray(), columns=keyEncoder.categories_)
df = df.merge(encodedB, left_index = True, right_index=True)
display(df)

# encoce major/minor into binary
modeEncoder = preprocessing.LabelEncoder()
df['mode'] = modeEncoder.fit_transform(df['mode'])

# also label encode
# encoce major/minor into binary
genreEncoder = preprocessing.LabelEncoder()
df['genre_label'] = genreEncoder.fit_transform(df['genre'])

# interpolate duration
df['duration_ms'] = df['duration_ms'].replace('-1',  np.nan)
df['duration_ms'] = df['duration_ms'].interpolate()

# scale
scalers = []

for col in columnsScaled: 
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(df[[col]])
    df[[col]] = x_scaled
    scalers.append(scaler)
    

df.info()
display(df)

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,...,"(EMajor,)","(EMinor,)","(F#Major,)","(F#Minor,)","(FMajor,)","(FMinor,)","(G#Major,)","(G#Minor,)","(GMajor,)","(GMinor,)"
0,50010,43,0.225000,0.845,238680,0.746,0.000088,FMinor,0.0785,-5.655,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,50011,47,0.665000,0.862,166154,0.342,0.000082,A#Minor,0.1020,-10.095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50012,48,0.238000,0.590,219400,0.517,0.000000,AMajor,0.1820,-9.239,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,50013,60,0.000077,0.374,-1,0.971,0.000000,BMinor,0.3220,-4.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50014,58,0.002730,0.449,-1,0.971,0.000000,A#Major,0.1410,-3.660,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42534,137534,17,0.000643,0.388,173093,0.985,0.005440,FMajor,0.3200,-4.374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42535,137535,25,0.007360,0.427,168387,0.995,0.000312,GMajor,0.0829,-4.661,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42536,137536,34,0.000051,0.452,133933,0.988,0.010700,EMinor,0.1740,-4.356,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42537,137537,39,0.008660,0.566,247080,0.577,0.008960,A#Minor,0.0758,-9.719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 36193 entries, 0 to 42538
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       36193 non-null  int64  
 1   popularity        36193 non-null  float64
 2   acousticness      36193 non-null  float64
 3   danceability      36193 non-null  float64
 4   duration_ms       36193 non-null  float64
 5   energy            36193 non-null  float64
 6   instrumentalness  36193 non-null  float64
 7   key               36193 non-null  object 
 8   liveness          36193 non-null  float64
 9   loudness          36193 non-null  float64
 10  mode              36193 non-null  int32  
 11  speechiness       36193 non-null  float64
 12  tempo             36193 non-null  float64
 13  valence           36193 non-null  float64
 14  genre             36193 non-null  object 
 15  (A#Major,)        36193 non-null  float64
 16  (A#Minor,)        36193 non-null  float6

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,...,"(EMinor,)","(F#Major,)","(F#Minor,)","(FMajor,)","(FMinor,)","(G#Major,)","(G#Minor,)","(GMajor,)","(GMinor,)",genre_label
0,50010,0.313707,-0.400158,1.719066,0.366741,0.522142,-0.409999,FMinor,-0.763154,0.706165,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,50011,0.525006,0.854842,1.825142,-0.106440,-1.123122,-0.410024,A#Minor,-0.668294,-0.271250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,50012,0.577831,-0.363078,0.127933,0.240952,-0.410446,-0.410369,AMajor,-0.345367,-0.082811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,50013,1.211731,-1.041699,-1.219851,-1.190484,1.438439,-0.410369,BMinor,0.219755,1.007975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,50014,1.106081,-1.034132,-0.751870,-1.190484,1.438439,-0.410369,A#Major,-0.510867,1.145341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42534,137534,-1.059742,-1.040085,-1.132494,-0.061168,1.495453,-0.387472,FMajor,0.211682,0.988162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
42535,137535,-0.637143,-1.020926,-0.889145,-0.091872,1.536178,-0.409056,GMajor,-0.745393,0.924983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
42536,137536,-0.161718,-1.041773,-0.733151,-0.316660,1.507671,-0.365333,EMinor,-0.377660,0.992125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
42537,137537,0.102407,-1.017218,-0.021821,0.421545,-0.166100,-0.372656,A#Minor,-0.774053,-0.188478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8


In [70]:
# remove columns used for understanding data

df.info()

droppable = ['instance_id', 'genre', 'key']

df = df.drop(droppable, axis=1)
df.info()

df.to_csv('training-processed-base.csv', sep = ',', index = False)

# do dimensionality reduction
# from tutorial

from sklearn.decomposition import PCA

dfCopy = df.copy()
inform = dfCopy.loc[:, dfCopy. columns != 'genre_label']
answer = dfCopy['genre_label']

pca = PCA(n_components=5) 
pca.fit(inform)
pca_train = pd.DataFrame(pca.transform(inform))

pca_train.info()

pca_train.to_csv('training-processed-pca.csv', sep = ',', index = False)

# do dimensionality reduction
# from tutorial

from sklearn.decomposition import FastICA

dfCopy = df.copy()
inform = dfCopy.loc[:, dfCopy. columns != 'genre_label']
answer = dfCopy['genre_label']

ica = FastICA(n_components=5) 
ica.fit(inform)
ica_train = pd.DataFrame(pca.transform(inform))

ica_train.info()

ica_train.to_csv('training-processed-ica.csv', sep = ',', index = False)

# from tutorial

dfCopy = df.copy()
inform = dfCopy.loc[:, dfCopy. columns != 'genre_label']
answer = dfCopy['genre_label']

!pip install gplearn

from gplearn.genetic import SymbolicTransformer
gp = SymbolicTransformer(n_components=5)
gp.fit(inform, answer)
gp_train = pd.DataFrame(gp.transform(inform))

gp_train.info()

gp_train.to_csv('training-processed-gpa.csv', sep = ',', index = False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36193 entries, 0 to 42538
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       36193 non-null  int64  
 1   popularity        36193 non-null  float64
 2   acousticness      36193 non-null  float64
 3   danceability      36193 non-null  float64
 4   duration_ms       36193 non-null  float64
 5   energy            36193 non-null  float64
 6   instrumentalness  36193 non-null  float64
 7   key               36193 non-null  object 
 8   liveness          36193 non-null  float64
 9   loudness          36193 non-null  float64
 10  mode              36193 non-null  int32  
 11  speechiness       36193 non-null  float64
 12  tempo             36193 non-null  float64
 13  valence           36193 non-null  float64
 14  genre             36193 non-null  object 
 15  (A#Major,)        36193 non-null  float64
 16  (A#Minor,)        36193 non-null  float6



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36193 entries, 0 to 36192
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       36193 non-null  float64
 1   1       36193 non-null  float64
 2   2       36193 non-null  float64
 3   3       36193 non-null  float64
 4   4       36193 non-null  float64
dtypes: float64(5)
memory usage: 1.4 MB




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36193 entries, 0 to 36192
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       36193 non-null  float64
 1   1       36193 non-null  float64
 2   2       36193 non-null  float64
 3   3       36193 non-null  float64
 4   4       36193 non-null  float64
dtypes: float64(5)
memory usage: 1.4 MB




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36193 entries, 0 to 36192
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       36193 non-null  float64
 1   1       36193 non-null  float64
 2   2       36193 non-null  float64
 3   3       36193 non-null  float64
 4   4       36193 non-null  float64
dtypes: float64(5)
memory usage: 1.4 MB


In [79]:
# process test data

df = pd.read_csv("test-data/test.csv", header = 0)

# drop columns
df = df.drop(columnsToDrop, axis=1)

# turn key into key with mode + encode
df['key'] = df['key'] + df['mode']
encoded = keyEncoder.transform(df[['key']])
encodedB = pd.DataFrame(encoded.toarray(), columns=keyEncoder.categories_)
df = df.merge(encodedB, left_index = True, right_index=True)

# encoce major/minor into binary
df['mode'] = modeEncoder.transform(df['mode'])

# interpolate duration + tempo

df['tempo'] = df['tempo'].replace('?',  np.nan)
df['tempo'] = df['tempo'].interpolate()
df['duration_ms'] = df['duration_ms'].replace('-1',  np.nan)
df['duration_ms'] = df['duration_ms'].interpolate()

# scale

for i in range(len(columnsScaled) ): 
    col = columnsScaled[i]
    scaler = scalers[i]
    x_scaled = scaler.transform(df[[col]])
    df[[col]] = x_scaled

# remove columns used for understanding data + save base
droppable = ['instance_id', 'key']
df = df.drop(droppable, axis=1)
df.to_csv('test-processed-base.csv', sep = ',', index = False)

display(df.info())
# do dimensionality reduction
# from tutorial

dfCopy = df.copy()
pca_test = pd.DataFrame(pca.transform(dfCopy))
pca_test.to_csv('test-processed-pca.csv', sep = ',', index = False)

# do dimensionality reduction
# from tutorial

dfCopy = df.copy()
ica_test = pd.DataFrame(ica.transform(dfCopy)) 
ica_test.to_csv('test-processed-pca.csv', sep = ',', index = False)

# from tutorial

dfCopy = df.copy()
gp_test = pd.DataFrame(gp.transform(dfCopy))
gp_test.to_csv('test-processed-gpa.csv', sep = ',', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30931 entries, 0 to 30930
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        30931 non-null  float64
 1   acousticness      30931 non-null  float64
 2   danceability      30931 non-null  float64
 3   duration_ms       30931 non-null  float64
 4   energy            30931 non-null  float64
 5   instrumentalness  30931 non-null  float64
 6   liveness          30931 non-null  float64
 7   loudness          30931 non-null  float64
 8   mode              30931 non-null  int32  
 9   speechiness       30931 non-null  float64
 10  tempo             26333 non-null  float64
 11  valence           30931 non-null  float64
 12  (A#Major,)        30931 non-null  float64
 13  (A#Minor,)        30931 non-null  float64
 14  (AMajor,)         30931 non-null  float64
 15  (AMinor,)         30931 non-null  float64
 16  (BMajor,)         30931 non-null  float6

None



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# ML

# Analysis Notes

The data contains 17 features, the id and the genre. Of those, every feature has every piece of information (meaning no missing data), there are 8 categorical features and 11 numerical or continuous features. There are 5000 instances in the data set. However, further processing will show that there *are* missing features: for example, the duration is set to -1 for some, which is impossible and thus missing data.

instance_id is a unique id for each instance. It needs to be kept for the Kaggle competition.

artist_name has an option for empty_field. A Google search shows no artist with that name; that's missing data. There is an unlimited number of artists possible, so there's no way to interpolate that correctly, as it may be someone not in the data set. While it may help to connect an artist to what their primary genre is, we do not have the data on every single artist in the world, meaning it's impossible to use the data for that. It'll be dropped.

track_name does not appear to have missing values. That being said, it'd be very hard to encode, as almost every title track is unique. This may provide a hint on what the song is about (love, ...), which may contribute to the algorithm, but for now, I will be dropping this column for a "minimum viable product" type of report; if I have time, I'll add it later.

The track ID appears to be another unique identifier for the instance, potentially linking it to an mp3 file, or similar. We were not given that, and thus, I decided to drop it.

Popularity will be standardised, as will acoutstiness, danceability, duration_ms, energy, instrumentalness, liveness, loudness, speechiness, tempo, valence, as all are numerical values that may contribute to finding the genre.

Mode and key are a special situation: the data set actually does a mistake, as C Minor and C Major are not the same key (quoting Wikipedia: "The key may be in the major or minor mode, though musicians assume major when this is not specified, e.g., "This piece is in C" implies that the key of the song is C major. Popular songs are usually in a key, and so is classical music during the common practice period, around 1650–1900. Longer pieces in the classical repertoire may have sections in contrasting keys." (https://en.wikipedia.org/wiki/Key_(music) on 10/09/2022) and my Piano teacher), and thus, I intend to merge them into one column, and then  encode them. However, major/minor does introduce a general mood to the piece of music (happy versus mellow, for example), and I will ALSO keep it as its own column. However, sicne C Minor and C Major are different keys, I'll keep them sepperate. 

Tempo signature needs to be changed. It's currently a date, when in fact, the correct display would be 4/4, ... Now, 4/4 and 2/2 may both equal 1, but musically, they're not the same, as 4/4 puts more emphasis on the 3rd beat, which 2/2 doesn't do. This needs to be encoded; whenever categorical or ordinal. On the other hand, it can be argued that the difference is minimal, and thus, just using a float of the division may work. Ultimatively, due to time constraints, I decided to drop the feature, and only return to it if I had time.

In [None]:
# # one hot encode genre
# # i am doing this for correlation
# dummies = pd.get_dummies(df.genre)
# df = pd.concat([df, dummies], axis=1)

# # heat map showing correlation between multiple features (ignoring the one hot encoded features)
# plt.figure(figsize=(10,10))
# smaller = df[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'mode','speechiness', 'tempo', 'valence', 'genre_label']]
# cor = smaller.corr(method='pearson')
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()

# # linking each numeric features w/ the genres

# numericFeatures = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'mode','speechiness', 'tempo', 'valence']

# # heat map showing correlation between multiple features (ignoring the one hot encoded features)
# for f in numericFeatures:
#     plt.figure(figsize=(10,5))
#     features = ['Alternative', 'Blues', "Children's Music", 'Comedy', 'Electronic', 'Folk', 'Hip-Hop', 'Movie', 'Ska', 'Soul', 'genre_label']
#     features.append(f)
#     smaller = df[features]
#     cor = smaller.corr(method='pearson')
#     sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
#     plt.show()
    
#     df.plot.scatter(x = 'genre', y = f, s = 100)
    
#     plt.figure(figsize=(10,5))
#     fig, axes = plt.subplots()
#     sns.violinplot('genre',f, data=df, ax = axes)
#     axes.set_title(f)
#     plt.show()


a**Please note that the code will crash in the above code because it was used during data analysis, the code changed and is just kept for record purposes.**

This heat map shows especially strong correlation between energy and loudness (to be expected; loud music is usually considered energetic) and high speechiness and liveness (also to be espected). Comparing the difference in other correlations between energy and loudness shows similar results, but in my opinion, they are different enough (especially in liveness, instrumentalness, and accoutsticness, as well as popularity) that they do not warrant one of them being removed. The same applies to speechiness and liveness. 

These heat maps show if one specific genre is especially correlated to a numeric feature. For example, it shows that the most correlatated genre of popularity is Hip-Hop at 0.38, etc.

Links between Features and Genres:

Popularity: Hip-Hop
Acousticness: Comedy, followed closely by Movie
Danceability: Hip-Hop
Duration_MS: All equally low.
Energy: Ska, followed by Alternative
Instrumentalness: Electronic
Liveness: Comedy
Loudness: Ska, Alternative
Mode:  Electronic
Speechiness: Comedy
Tempo: all very low
Valence: Ska