In [1]:
import pandas as pd

In [2]:
# Load one table per mood; the first CSV column is used as the row index
happy = pd.read_csv('./analysis_data/happy.csv', index_col=0)
sad = pd.read_csv('./analysis_data/sad.csv', index_col=0)
workout = pd.read_csv('./analysis_data/workout.csv', index_col=0)

In [3]:
# Count missing values per column -- note that only the `sad` frame is
# inspected here; `happy` and `workout` are not checked by this cell
sad.isna().sum()

name                0
album               0
artist              0
release_date        0
length              0
popularity          0
danceability        0
acousticness        0
danceability.1      0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
time_signature      0
mood                0
dtype: int64

In [4]:
# Number of fully identical rows in each mood frame, printed in the
# order sad, happy, workout
for mood_df in (sad, happy, workout):
    print(mood_df.duplicated().sum())

1293
130
75


In [5]:
# Within each mood frame, keep only the first copy of any fully identical row
sad, happy, workout = (
    mood_df.drop_duplicates() for mood_df in (sad, happy, workout)
)

In [6]:
# Sanity check after de-duplication: each count (sad, happy, workout) should be 0
print(*(mood_df.duplicated().sum() for mood_df in (sad, happy, workout)), sep='\n')

0
0
0


In [7]:
# Stack the three mood frames (happy first, then sad, then workout) into one table
mood_frames = [happy, sad, workout]
music_total = pd.concat(mood_frames)
# Number of non-null 'length' values, used as the size of the combined table
count = music_total.loc[:, 'length'].count()
print(count)

3833


In [8]:
# Count rows whose (name, album, artist) triple has already appeared earlier
# in the combined table, i.e. the same track listed more than once
track_key = ['name', 'album', 'artist']
print(music_total.duplicated(subset=track_key).sum())

244


In [9]:
# Keep only the first row seen for each (name, album, artist) triple, so a
# track that appears several times keeps the row (and mood label) that comes
# earliest in `music_total`
repeat_mask = music_total.duplicated(subset=['name', 'album', 'artist'], keep='first')
clean_music_total = music_total[~repeat_mask]

In [10]:
# Report how many rows the track-level de-duplication removed, and what remains
new_count = clean_music_total.loc[:, 'length'].count()
removed = count - new_count
print(removed)
print('Filtered dataframe is: ', new_count, ' songs long.')

244
Filtered dataframe is:  3589  songs long.


In [11]:
# Most frequent track titles in the combined table (all moods, not only sad)
title_counts = clean_music_total['name'].value_counts()
title_counts

Forever            6
All I Want         6
Hold On            5
Chasing Cars       5
Broken             5
                  ..
Your Dog           1
Mystery of Love    1
Columbia River     1
Eugene             1
goosebumps         1
Name: name, Length: 3277, dtype: int64

In [12]:
# Look at the rows behind one repeated title to see how the "duplicates"
# differ from each other
is_all_i_want = clean_music_total['name'].eq('All I Want')
clean_music_total[is_all_i_want]

Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,danceability.1,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,mood
279,All I Want,In A Perfect World (Expanded Edition),Kodaline,2013-06-17,305746,82,0.188,0.174,0.188,0.411,0.153,0.0843,-9.733,0.0484,187.376,3,1
631,All I Want,High School Musical: The Musical: The Series (...,Various Artists,2020-01-10,177322,73,0.376,0.0902,0.376,0.43,0.0,0.0912,-6.585,0.0328,77.599,3,1
818,All I Want,In A Perfect World,Kodaline,2013-10-08,305746,0,0.209,0.172,0.209,0.412,0.15,0.0843,-9.733,0.0443,86.26,3,1
1955,All I Want,The Fault In Our Stars: Music From The Motion ...,Various Artists,2014-05-19,305946,0,0.299,0.146,0.299,0.432,0.17,0.0782,-9.099,0.0384,125.125,4,1
2108,All I Want,The Kodaline EP,Kodaline,2012-09-07,306600,0,0.189,0.114,0.189,0.426,0.0885,0.058,-9.084,0.045,187.281,3,1
3448,All I Want,In A Perfect World (Deluxe),Kodaline,2013-06-14,305746,1,0.188,0.174,0.188,0.411,0.153,0.0843,-9.733,0.0484,187.376,3,1


In [31]:
# Drop the text/identifier columns (name, album, artist), the release date,
# and the 'danceability.1' column, leaving the audio features plus the mood label
dropped_cols = ['name', 'album', 'release_date', 'danceability.1', 'artist']
cmt = clean_music_total.drop(dropped_cols, axis='columns')
cmt

Unnamed: 0,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,mood
0,191706,70,0.561,0.000715,0.741,0.000014,0.0822,-4.239,0.0419,139.001,4,2
1,239600,84,0.733,0.145000,0.710,0.115000,0.0956,-5.849,0.0292,127.975,4,2
2,218013,80,0.560,0.008470,0.936,0.000000,0.1610,-5.835,0.0439,112.960,4,2
3,193106,67,0.591,0.060500,0.831,0.000047,0.1900,-5.647,0.0415,144.084,4,2
4,182693,72,0.695,0.006470,0.818,0.000005,0.0219,-5.379,0.0334,119.965,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
650,242965,91,0.849,0.063500,0.424,0.000000,0.0834,-9.579,0.3240,145.887,4,0
651,175720,84,0.598,0.054600,0.427,0.000006,0.2100,-8.764,0.0317,76.469,4,0
653,132780,1,0.778,0.175000,0.695,0.000000,0.1500,-6.865,0.0913,149.996,4,0
660,221979,89,0.767,0.181000,0.438,0.000000,0.1420,-8.726,0.2900,86.975,4,0


In [15]:
# NOTE(review): disabled experiment -- one-hot encoding of the artist column.
# `cmt` is built without an 'artist' column, so re-enabling this as-is would
# presumably fail on the missing column. Because the assignment below is
# commented out, `df_enc` is never created on a fresh run. The table rendered
# under this cell appears to be left over from an earlier session.
# # Encode string columns
# df_enc = pd.get_dummies(cmt,columns=['artist'])
# df_enc

Unnamed: 0,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,artist_the bootleg boy,artist_toastool,artist_weeklyn,artist_whiterosemoxie,artist_will.i.am,artist_yaeow,artist_yes I M,artist_yung van,artist_zukrai,artist_Öwnboss
0,191706,70,0.561,0.000715,0.741,0.000014,0.0822,-4.239,0.0419,139.001,...,0,0,0,0,0,0,0,0,0,0
1,239600,84,0.733,0.145000,0.710,0.115000,0.0956,-5.849,0.0292,127.975,...,0,0,0,0,0,0,0,0,0,0
2,218013,80,0.560,0.008470,0.936,0.000000,0.1610,-5.835,0.0439,112.960,...,0,0,0,0,0,0,0,0,0,0
3,193106,67,0.591,0.060500,0.831,0.000047,0.1900,-5.647,0.0415,144.084,...,0,0,0,0,0,0,0,0,0,0
4,182693,72,0.695,0.006470,0.818,0.000005,0.0219,-5.379,0.0334,119.965,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,242965,91,0.849,0.063500,0.424,0.000000,0.0834,-9.579,0.3240,145.887,...,0,0,0,0,0,0,0,0,0,0
651,175720,84,0.598,0.054600,0.427,0.000006,0.2100,-8.764,0.0317,76.469,...,0,0,0,0,0,0,0,0,0,0
653,132780,1,0.778,0.175000,0.695,0.000000,0.1500,-6.865,0.0913,149.996,...,0,0,0,0,0,0,0,0,0,0
660,221979,89,0.767,0.181000,0.438,0.000000,0.1420,-8.726,0.2900,86.975,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Class balance of the label column (0 = Workout, 1 = Sad, 2 = Happy)
mood_counts = cmt['mood'].value_counts()
mood_counts

1    2076
2    1016
0     497
Name: mood, dtype: int64

In [33]:
# Separate the model inputs (features) from the label (target)

# Features: every column of `cmt` except the label.
# The comparison is an exact equality on purpose: `('mood')` without a trailing
# comma is just the string 'mood', so `col not in ('mood')` would be a
# substring test rather than a membership test.
x_cols = [col for col in cmt.columns if col != 'mood']
X = cmt[x_cols]

# Target: the mood label (0 = Workout, 1 = Sad, 2 = Happy), taken from the same
# frame as X so rows stay aligned. `df_enc` is only assigned in a commented-out
# cell, so reading the label from it fails on a fresh kernel.
y = cmt['mood']

In [34]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the default split proportions; the seed is fixed
# so the same rows land in each split on every run
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [35]:
# List the feature columns to confirm the label ('mood') is not among them
X.columns

Index(['length', 'popularity', 'danceability', 'acousticness', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'time_signature'],
      dtype='object')

In [36]:
# Class balance of the full target (before the train/test split)
y.value_counts()

1    2076
2    1016
0     497
Name: mood, dtype: int64

## Balanced Random Forest Classifier

In [37]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Fit a 100-tree balanced random forest on the training split (seeded for
# repeatability), then predict the mood of every held-out row
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(
    X_train, y_train
)
y_pred = brf_model.predict(X_test)

In [38]:
# Plain accuracy (fraction of test rows predicted correctly) for the forest.
# NOTE(review): this is `accuracy_score`, not balanced accuracy; with classes
# this uneven, `sklearn.metrics.balanced_accuracy_score` may be the metric that
# was intended -- confirm before quoting this number as "balanced".
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6681514476614699

In [39]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the forest: rows are the true mood, columns the predicted mood
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 59,  13,  44],
       [ 63, 375,  77],
       [ 68,  33, 166]])

In [40]:
from imblearn.metrics import classification_report_imbalanced

# Per-class report for the forest (0 = Workout, 1 = Sad, 2 = Happy)
brf_report = classification_report_imbalanced(y_test, y_pred)
print(brf_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.31      0.51      0.83      0.39      0.65      0.41       116
          1       0.89      0.73      0.88      0.80      0.80      0.63       515
          2       0.58      0.62      0.81      0.60      0.71      0.49       267

avg / total       0.72      0.67      0.85      0.69      0.75      0.56       898



## Results for Random Forest
- Workout (class 0): our algorithm has low precision (31%) and low recall (51%).
- Sad (class 1): high precision (89%) and decent recall (73%).
- Happy (class 2): relatively mediocre precision (58%) and recall (62%).

Overall, the Balanced Random Forest performs slightly better than the AdaBoost-based Easy Ensemble model (see below): accuracy of about 0.67 versus 0.65 on the same test split.

In [41]:
# Read the feature importance scores from the fitted forest.
# Presumably one score per column of X, in X's column order -- the sorting
# cell pairs them with X.columns on that assumption.
importances = brf_model.feature_importances_
importances

array([0.07565149, 0.07415291, 0.10880735, 0.15625902, 0.18487275,
       0.04278679, 0.07166616, 0.09896272, 0.08773962, 0.09353906,
       0.00556213])

In [42]:
# Pair each importance score with its feature name and rank them from most
# to least important
ranked_features = list(zip(brf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.18487275177773113, 'energy'),
 (0.15625902308355052, 'acousticness'),
 (0.10880735003897564, 'danceability'),
 (0.09896272286845698, 'loudness'),
 (0.09353905559823547, 'tempo'),
 (0.08773962228028076, 'speechiness'),
 (0.07565148689282589, 'length'),
 (0.07415290557775549, 'popularity'),
 (0.0716661583807949, 'liveness'),
 (0.04278678866551852, 'instrumentalness'),
 (0.005562134835874629, 'time_signature')]

## Easy Ensemble (AdaBoost) Classifier

In [43]:
from imblearn.ensemble import EasyEnsembleClassifier

# Fit an easy-ensemble classifier (100 estimators, seeded) on the training
# split; `y_pred` now holds this model's predictions, replacing the forest's
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)
y_pred = eec.predict(X_test)

In [44]:
# Plain accuracy (fraction of test rows predicted correctly) for the easy
# ensemble. NOTE(review): `accuracy_score` is not balanced accuracy -- confirm
# which metric was intended.
accuracy_score(y_test, y_pred)

0.6547884187082406

In [45]:
# Confusion matrix for the easy ensemble: rows are the true mood, columns the
# predicted mood
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 55,  15,  46],
       [ 68, 377,  70],
       [ 80,  31, 156]])

In [46]:
# Per-class report for the easy ensemble.
# Label key: 0 = Workout, 1 = Sad, 2 = Happy
eec_report = classification_report_imbalanced(y_test, y_pred)
print(eec_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.27      0.47      0.81      0.34      0.62      0.37       116
          1       0.89      0.73      0.88      0.80      0.80      0.63       515
          2       0.57      0.58      0.82      0.58      0.69      0.47       267

avg / total       0.72      0.65      0.85      0.68      0.75      0.55       898



## Results for Easy Ensemble (AdaBoost)
- Workout (class 0): our algorithm has low precision (27%) and low recall (47%).
- Sad (class 1): high precision (89%) and decent recall (73%).
- Happy (class 2): relatively mediocre precision (57%) and recall (58%).

## Notes

Multiclass Classification: A classification task with more than two classes; e.g., classify a set of images of fruits which may be oranges, apples, or pears. Multi-class classification makes the assumption that each sample is assigned to one and only one label: a fruit can be either an apple or a pear but not both at the same time.