### Logistic Regression with instumentalness and energy

#### Modules

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

#### Prepare the Data 

In [15]:
# read data
data = pd.read_pickle('merged_data.pkl') 

# identfiy numerical volumns
columns = ['energy','instrumentalness','is_hit']

df = data[columns] # get numerical columns
df.head(1)

Unnamed: 0,energy,instrumentalness,is_hit
0,0.428,3e-06,0


In [7]:
X = df.drop('is_hit', axis=1) # features for model
y = df['is_hit'] # target variable

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

#### Fitting the model

In [10]:
# split data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

# fit L1-regularized logistic regression on training set
logRegr = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

In [11]:
score = logRegr.score(x_test, y_test)
error_rate = 1 - score
print("Prediction accuracy:", 1 - error_rate)

Prediction accuracy: 0.8880982748204337


In [12]:
# cross-validation
rfc_cv_score = cross_val_score(logRegr, X, y, cv=10, scoring="roc_auc")

# predictions
y_predict = logRegr.predict(x_test)

In [13]:
print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_predict))
x = pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
print(x)
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_predict))
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Logistic Regression ", rfc_cv_score.mean())

=== Confusion Matrix ===
Predicted      0
Actual          
0          13230
1           1667


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     13230
           1       0.00      0.00      0.00      1667

   micro avg       0.89      0.89      0.89     14897
   macro avg       0.44      0.50      0.47     14897
weighted avg       0.79      0.89      0.84     14897

=== All AUC Scores ===
[0.53285884 0.54385112 0.55498265 0.54186823 0.52798842 0.57502272
 0.55730395 0.52998026 0.47189594 0.52515252]


=== Mean AUC Score ===
Mean AUC Score - Logistic Regression  0.5360904658786663


  'precision', 'predicted', average, warn_for)
