## Input Dataset

In [1]:
import pandas as pd

In [2]:
# Read the per-playlist track tables from the shared input folder.
sad_df = pd.read_csv('../InputData/sad.csv')
workout_df = pd.read_csv('../InputData/workout.csv')

# The workout tracks are appended to the happy table as soon as it is loaded;
# from here on `happy_df` holds both sets of rows (original row labels kept).
happy_df = pd.concat([pd.read_csv('../InputData/happy.csv'), workout_df])

### 0 = happy, 1 = sad

In [3]:
# One label per row, sized to each table.
mood_sad = ['Sad'] * len(sad_df)
mood_happy = ['Happy'] * len(happy_df)

# Sad tracks: readable label plus the model target.
# NOTE: the target is stored as the *string* '1', not the integer 1.
sad_df['Moods'] = mood_sad
sad_df['mood'] = ['1'] * len(sad_df)

# Happy tracks (which include the workout rows): target is the string '0'.
happy_df['Moods'] = mood_happy
happy_df['mood'] = ['0'] * len(happy_df)

In [4]:
# Stack the sad and happy rows into a single table. ignore_index=True discards
# the per-file row labels and renumbers the rows 0..n-1; later cells rely on
# that because they index prediction arrays with the row label.
test_df = pd.concat([sad_df, happy_df], ignore_index=True)

print(test_df.shape)
test_df.head()

(5331, 19)


Unnamed: 0.1,Unnamed: 0,name,album,artist,release_date,length,popularity,danceability,acousticness,danceability.1,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,mood,Moods
0,0,everything i wanted,everything i wanted,Billie Eilish,2019-11-13,245425,82,0.704,0.902,0.704,0.225,0.657,0.106,-14.454,0.0994,120.006,4,1,Sad
1,1,ghostin,"thank u, next",Ariana Grande,2019-02-08,271466,69,0.287,0.418,0.287,0.364,1.8e-05,0.185,-8.295,0.0306,103.777,4,1,Sad
2,2,Too Good At Goodbyes,The Thrill Of It All (Special Edition),Sam Smith,2017-11-03,201000,74,0.681,0.64,0.681,0.372,0.0,0.169,-8.237,0.0432,91.873,4,1,Sad
3,3,i love you,"WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?",Billie Eilish,2019-03-29,291796,80,0.421,0.952,0.421,0.131,0.00453,0.109,-18.435,0.0382,137.446,4,1,Sad
4,4,I Fall Apart,Stoney (Deluxe),Post Malone,2016-12-09,223346,80,0.556,0.0689,0.556,0.538,0.0,0.196,-5.408,0.0382,143.95,4,1,Sad


In [5]:
# Persist the combined, labelled table as a comma-separated file. The row
# index is written as well (to_csv default), as an unnamed leading column.
test_df.to_csv("../InputData/happy_sad_dataset.csv")

## Machine Learning Model

### Initial Setup

In [6]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [7]:
# Feature matrix: whatever remains after removing the saved row counter, the
# text/date metadata, both label columns, the duplicated danceability column,
# and the length and loudness columns.
X = test_df.drop(
    columns=[
        'Unnamed: 0', 'name', 'album', 'artist', 'release_date',
        'mood', 'length', 'danceability.1', 'Moods', 'loudness',
    ]
)

# Target: the string-encoded class ('0' = happy, '1' = sad).
y = test_df['mood']

In [8]:
# Hold out part of the data for evaluation. The split size is left at the
# library default and the shuffle is pinned with a fixed seed so the same
# rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [9]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() hands back the fitted scaler

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [11]:
# Fit a logistic-regression classifier (two classes: '0' happy, '1' sad) on
# the standardised training features.
model = LogisticRegression(solver='lbfgs').fit(X_train_scaled, y_train)

# Predict the held-out rows and report accuracy to three decimals.
y_pred = model.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.848


In [12]:

# Confusion matrix for the logistic-regression test predictions, wrapped in a
# labelled frame so rows read as actual classes and columns as predicted ones.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Overall accuracy on the same predictions.
acc_score = accuracy_score(y_test, y_pred)

# Report: matrix, accuracy, then per-class precision/recall/F1.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, y_pred),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,365,81
Actual 1,121,766


Accuracy Score : 0.8484621155288822
Classification Report
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       446
           1       0.90      0.86      0.88       887

    accuracy                           0.85      1333
   macro avg       0.83      0.84      0.83      1333
weighted avg       0.85      0.85      0.85      1333



In [14]:
# Score every track (training and test rows alike) with the logistic-regression
# estimator currently bound to `model`. `X_total_scale` is reused by the later
# per-model cells.
X_total_scale = X_scaler.transform(X)
y_logisticReg_pred = model.predict(X_total_scale)

# Record, per row, whether the prediction matches the stored label.
# The comparison is done in one vectorized step instead of an iterrows() loop
# with per-cell .loc writes; the values stay the strings 'True' / 'False'.
# The column name keeps its existing spelling because it ends up in the
# exported CSV.
test_df['Logistc_Results'] = (
    (test_df['mood'] == y_logisticReg_pred).map({True: 'True', False: 'False'})
)


### Decision Tree

In [16]:
# Decision tree trained on the same standardised split.
# NOTE: this rebinds `model`, which held the logistic-regression estimator up
# to this point.
# random_state is pinned (same seed as the train/test split) because the tree
# builder breaks ties between equally good splits at random; without a seed
# the scores below change from run to run.
model = tree.DecisionTreeClassifier(random_state=78)
model = model.fit(X_train_scaled, y_train)

# Predict the held-out rows.
predictions = model.predict(X_test_scaled)

# Confusion matrix as a labelled frame (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Overall accuracy on the same predictions.
acc_score = accuracy_score(y_test, predictions)

# Report: matrix, accuracy, then per-class precision/recall/F1.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,348,98
Actual 1,122,765


Accuracy Score : 0.8349587396849212
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       446
           1       0.89      0.86      0.87       887

    accuracy                           0.83      1333
   macro avg       0.81      0.82      0.82      1333
weighted avg       0.84      0.83      0.84      1333



In [17]:
# Score every track with the estimator currently bound to `model` (the
# decision tree at this point in the notebook), using the full scaled matrix.
y_DT_pred = model.predict(X_total_scale)

# Record, per row, whether the prediction matches the stored label.
# One vectorized comparison replaces the iterrows() loop with per-cell .loc
# writes; the values stay the strings 'True' / 'False'.
test_df['DT_Results'] = (
    (test_df['mood'] == y_DT_pred).map({True: 'True', False: 'False'})
)


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Random forest of 128 trees with a fixed seed, trained on the standardised
# training split.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict the held-out rows.
predictions = rf_model.predict(X_test_scaled)

# Confusion matrix as a labelled frame (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Overall accuracy on the same predictions.
acc_score = accuracy_score(y_test, predictions)

# Report: matrix, accuracy, then per-class precision/recall/F1.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,373,73
Actual 1,86,801


Accuracy Score : 0.8807201800450113
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       446
           1       0.92      0.90      0.91       887

    accuracy                           0.88      1333
   macro avg       0.86      0.87      0.87      1333
weighted avg       0.88      0.88      0.88      1333



In [21]:
# Score every track with the random forest, using the full scaled matrix.
y_RF_pred = rf_model.predict(X_total_scale)

# Record, per row, whether the prediction matches the stored label.
# One vectorized comparison replaces the iterrows() loop with per-cell .loc
# writes; the values stay the strings 'True' / 'False'.
test_df['RF_Results'] = (
    (test_df['mood'] == y_RF_pred).map({True: 'True', False: 'False'})
)

In [23]:
# Save the table together with the per-model result columns added above.
# The row index is written as well (to_csv default), as an unnamed first column.
test_df.to_csv("../InputData/happy_sad_dataset_with_ML.csv")

### Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Try several learning rates for a small gradient-boosting ensemble and print
# train vs. held-out accuracy for each. The "validation" figure is computed on
# the test split. After the loop `classifier` is the last model fitted.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )
    classifier.fit(X_train_scaled, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(X_train_scaled, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(X_test_scaled, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.860
Accuracy score (validation): 0.847
Learning rate:  0.1
Accuracy score (training): 0.870
Accuracy score (validation): 0.860
Learning rate:  0.25
Accuracy score (training): 0.881
Accuracy score (validation): 0.854
Learning rate:  0.5
Accuracy score (training): 0.896
Accuracy score (validation): 0.857
Learning rate:  0.75
Accuracy score (training): 0.902
Accuracy score (validation): 0.849
Learning rate:  1
Accuracy score (training): 0.906
Accuracy score (validation): 0.848


In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Refit gradient boosting with learning_rate=0.1, the value with the highest
# held-out accuracy in the sweep output above. This rebinds `classifier`.
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_features=5,
    max_depth=3,
    random_state=0,
)
classifier.fit(X_train_scaled, y_train)

# Predict the held-out rows and report overall accuracy.
predictions = classifier.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.859714928732183


In [26]:
# Confusion matrix for the gradient-boosting test predictions, as a labelled
# frame (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,366,80
Actual 1,107,780


In [27]:
# Full gradient-boosting report: matrix, accuracy, then per-class
# precision/recall/F1 for the test predictions.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,366,80
Actual 1,107,780


Accuracy Score : 0.859714928732183
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.82      0.80       446
           1       0.91      0.88      0.89       887

    accuracy                           0.86      1333
   macro avg       0.84      0.85      0.84      1333
weighted avg       0.86      0.86      0.86      1333

