### Logistic Regression Model with Valence, Liveness, and Loudness

#### Modules

In [46]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import warnings; warnings.simplefilter('ignore')

#### Prepare the Data and Fit All the Variables

In [47]:
# read data
data = pd.read_pickle('merged_data.pkl') 

# identfiy numerical volumns
columns = ['danceability','energy','key','mode','loudness',
 'speechiness','acousticness','instrumentalness',
 'liveness','valence','tempo','time_signature','disc_number','duration_ms','is_hit']

df = data[columns] # get numerical columns
df.head(1)

Unnamed: 0,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,disc_number,duration_ms,is_hit
0,0.67,0.428,3,0,-10.161,0.0369,0.537,3e-06,0.118,0.326,100.033,4,1,289596,0


In [48]:
X = df.drop('is_hit', axis=1) # features for model
y = df['is_hit'] # target variable

In [49]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [50]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [51]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

In [52]:
feature_select = SelectKBest(f_classif, k = 3)
feature_select.fit(x_train, y_train)
x_train_chi = feature_select.transform(x_train)
x_test_chi = feature_select.transform(x_test)
print(feature_select.scores_)

[19.42836372 19.67783236  0.80153495  0.19006025 35.55530855 25.58692309
  9.21591105 29.96127582 65.89550744 68.67132056 15.75334509  0.70304927
  9.62383333 19.41414103]


##### Top features are Valence, Liveness, and Loudness

In [53]:
logRegr2 = LogisticRegression(solver='lbfgs')
logRegr2.fit(x_train_chi, y_train)
score2 = logRegr2.score(x_test_chi, y_test)
error_rate2 = 1 - score2
1 - error_rate2

0.8880982748204337

In [54]:
# cross-validation
rfc_cv_score = cross_val_score(logRegr2, X, y, cv=10, scoring="roc_auc")

# predictions
y_predict = logRegr2.predict(x_test_chi)

In [55]:
print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_predict))
x = pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
print(x)
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_predict))
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Logistic Regression ", rfc_cv_score.mean())

=== Confusion Matrix ===
Predicted      0
Actual          
0          13230
1           1667


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     13230
           1       0.00      0.00      0.00      1667

   micro avg       0.89      0.89      0.89     14897
   macro avg       0.44      0.50      0.47     14897
weighted avg       0.79      0.89      0.84     14897

=== All AUC Scores ===
[0.55954285 0.5272056  0.57737906 0.60082517 0.58614531 0.61554357
 0.60745836 0.56391268 0.54028699 0.55168367]


=== Mean AUC Score ===
Mean AUC Score - Logistic Regression  0.5729983271398184
