# Logistic Regression with all the variables

#### Modules

In [133]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

#### Prepare the Data and Fit All the Variables

In [145]:
# read data
data = pd.read_pickle('merged_data.pkl') 

# identfiy numerical volumns
columns = ['danceability','energy','key','loudness','mode',
 'speechiness','acousticness','instrumentalness',
 'liveness','valence','tempo','time_signature','disc_number','duration_ms','is_hit']

df = data[columns] # get numerical columns
df.head(1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,disc_number,duration_ms,is_hit
0,0.67,0.428,3,-10.161,0,0.0369,0.537,3e-06,0.118,0.326,100.033,4,1,289596,0


In [135]:
X = df.drop('is_hit', axis=1) # features for model
y = df['is_hit'] # target variable

In [139]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [137]:
# split data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

# fit L1-regularized logistic regression on training set
logRegr = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

In [138]:
score = logRegr.score(x_test, y_test)
error_rate = 1 - score
print("Prediction accuracy:", 1 - error_rate)

Prediction accuracy: 0.8880982748204337


In [126]:
# cross-validation
rfc_cv_score = cross_val_score(logRegr, X, y, cv=10, scoring="roc_auc")

# predictions
y_predict = logRegr.predict(x_test)

In [127]:
print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_predict))
x = pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
print(x)
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_predict))
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Logistic Regression ", rfc_cv_score.mean())

=== Confusion Matrix ===
Predicted      0
Actual          
0          13230
1           1667


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     13230
           1       0.00      0.00      0.00      1667

   micro avg       0.89      0.89      0.89     14897
   macro avg       0.44      0.50      0.47     14897
weighted avg       0.79      0.89      0.84     14897

=== All AUC Scores ===
[0.55954285 0.5272056  0.57737906 0.60082517 0.58614531 0.61554357
 0.60745836 0.56391268 0.54028699 0.55168529]


=== Mean AUC Score ===
Mean AUC Score - Logistic Regression  0.5729984895464244


  'precision', 'predicted', average, warn_for)
