In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

## Data Preprocessing

In [None]:
# Load the movie dataset from the Resources folder and preview the first rows
movies_csv_path = "../Resources/imdb_final.csv"
movies = pd.read_csv(movies_csv_path)
movies.head()

In [3]:
# Predictors: five movie attributes. Target: the rating class, kept as a column vector (n_samples, 1).
feature_columns = ["year", "genre", "duration", "director", "budget"]
X = movies[feature_columns]
y = movies["rating_class"].values.reshape(-1, 1)
print(X.shape, y.shape)

(5060, 5) (5060, 1)


In [4]:
# Work on an independent (deep) copy so the encoding steps never touch X itself
data = X.copy(deep=True)
data

Unnamed: 0,year,genre,duration,director,budget
0,2001,Comedy,118,James Mangold,48000000
1,2000,Musical,86,Michael Ritchie,10000000
2,2001,Drama,104,Vondie Curtis-Hall,22000000
3,2001,Comedy,100,Jeremy Kasten,1000000
4,2000,Crime,87,Bryan Johnson,120000
...,...,...,...,...,...
5055,2019,Comedy,84,Jon Lucas,5000000
5056,2019,Drama,94,Dan Sallitt,95000
5057,2019,Action,84,Glenn Miller,100000
5058,2019,Action,92,Keoni Waxman,3000000


In [5]:
# One-hot (dummy) encode both categorical columns, genre and director;
# the remaining columns (year, duration, budget) are carried over unchanged.
categorical_columns = ["genre", "director"]
data_binary_encoded = pd.get_dummies(data, columns=categorical_columns)
data_binary_encoded.head()

Unnamed: 0,year,duration,budget,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,...,director_Zack Snyder,director_Zackary Adler,director_Zak Knutson,director_Zak Penn,director_Zebediah De Soto,director_Zia Mojabi,director_Ziad H. Hamzeh,director_Zoe Quist,director_Zoran Lisinac,director_mink
0,2001,118,48000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,86,10000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,104,22000000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2001,100,1000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,87,120000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Split into train and test sets, stratified on the rating class so both
# splits keep the same class proportions; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_binary_encoded,
    y,
    stratify=y,
    random_state=42,
)


In [13]:
# Encode the string rating classes as integer codes.
# LabelEncoder expects a 1-D array of labels, but y_train / y_test are column
# vectors of shape (n, 1); passing them directly raises a DataConversionWarning.
# Flatten on the way in, then restore the (n, 1) shape the rest of the notebook uses.
label_encoder = LabelEncoder()
label_encoder.fit(np.ravel(y_train))
encoded_y_train = label_encoder.transform(np.ravel(y_train)).reshape(-1, 1)
encoded_y_test = label_encoder.transform(np.ravel(y_test)).reshape(-1, 1)
encoded_y_train

  return f(**kwargs)


array([[1],
       [0],
       [0],
       ...,
       [0],
       [2],
       [2]])

In [14]:
# Standardise the features with statistics learned from the training split only,
# so nothing about the test rows influences the scaling parameters.
# The targets are integer class codes for a classifier, so they are left unscaled.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Create Logistic Regression Model

In [15]:
# Create and fit the baseline logistic regression classifier on the scaled features.
# scikit-learn classifiers expect a 1-D target; encoded_y_train is a column vector,
# so flatten it here to avoid the DataConversionWarning (the later fits in this
# notebook already do the same).
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, np.ravel(encoded_y_train))


  return f(**kwargs)


LogisticRegression(max_iter=1000)

In [16]:
# Mean accuracy of the baseline model on the training split and the held-out test split
training_score = log_model.score(X_train_scaled, encoded_y_train)
testing_score = log_model.score(X_test_scaled, encoded_y_test)
print(
    f"Logistic Regression Training Score: {training_score: .3f}",
    f"Logistic Regression Testing Score: {testing_score: .3f}",
    sep="\n",
)

Logistic Regression Training Score:  0.953
Logistic Regression Testing Score:  0.661


In [17]:
# Predict encoded classes for both splits, then map the test predictions
# back to their original string labels.
predictions_train = log_model.predict(X_train_scaled)
predictions_test = log_model.predict(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(predictions_test)

# Spot-check: every 10th row among the first 100 test rows, predicted vs. actual
every_tenth = slice(None, 100, 10)
print(prediction_labels[every_tenth])
print(y_test[every_tenth])

['Good' 'Good' 'Excellent' 'Good' 'Good' 'Good' 'Good' 'Good' 'Good'
 'Good']
[['Good']
 ['Excellent']
 ['Excellent']
 ['Good']
 ['Good']
 ['Bad']
 ['Good']
 ['Good']
 ['Bad']
 ['Excellent']]


In [18]:
# Classification report: per-class precision, recall and F1 on the test split
from sklearn.metrics import classification_report
test_report = classification_report(y_test, prediction_labels)
print(test_report)

              precision    recall  f1-score   support

         Bad       0.57      0.18      0.28       270
   Excellent       0.39      0.20      0.26       167
        Good       0.69      0.91      0.78       828

    accuracy                           0.66      1265
   macro avg       0.55      0.43      0.44      1265
weighted avg       0.62      0.66      0.61      1265



In [19]:
# Confusion matrix on the test split: rows are the true classes, columns the predicted classes
from sklearn.metrics import confusion_matrix as cm
cm(y_true=y_test, y_pred=prediction_labels)

array([[ 49,   8, 213],
       [  6,  33, 128],
       [ 31,  43, 754]], dtype=int64)

## Grid Search

In [14]:
# Set up a cross-validated grid search over the baseline model, selecting on accuracy.
from sklearn.model_selection import GridSearchCV
C = [1, 5, 10]  # candidate values for LogisticRegression's C
max_iter = [500, 750, 1000]  # candidate caps on solver iterations
param_grid = {"C": C, "max_iter": max_iter}
grid = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring="accuracy",
    verbose=2,
)

In [15]:
# List the parameter names the estimator exposes (valid keys for a param_grid)
available_params = log_model.get_params()
available_params.keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [17]:
# Run the search on the scaled training data; the target column vector is flattened to 1-D first
y_train_flat = np.ravel(encoded_y_train)
grid.fit(X_train_scaled, y_train_flat)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, max_iter=500 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=1, max_iter=500, total=   4.3s
[CV] C=1, max_iter=500 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV] ................................ C=1, max_iter=500, total=   4.2s
[CV] C=1, max_iter=500 ...............................................
[CV] ................................ C=1, max_iter=500, total=   4.4s
[CV] C=1, max_iter=500 ...............................................
[CV] ................................ C=1, max_iter=500, total=   4.6s
[CV] C=1, max_iter=500 ...............................................
[CV] ................................ C=1, max_iter=500, total=   5.0s
[CV] C=1, max_iter=750 ...............................................
[CV] ................................ C=1, max_iter=750, total=   6.9s
[CV] C=1, max_iter=750 ...............................................
[CV] ................................ C=1, max_iter=750, total=   4.7s
[CV] C=1, max_iter=750 ...............................................
[CV] ................................ C=1, max_iter=750, total=   4.5s
[CV] C=1, max_iter=750 ...............................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ............................... C=10, max_iter=500, total=  11.2s
[CV] C=10, max_iter=500 ..............................................
[CV] ............................... C=10, max_iter=500, total=   8.8s
[CV] C=10, max_iter=500 ..............................................
[CV] ............................... C=10, max_iter=500, total=   9.6s
[CV] C=10, max_iter=500 ..............................................
[CV] ............................... C=10, max_iter=500, total=   9.6s
[CV] C=10, max_iter=750 ..............................................
[CV] ............................... C=10, max_iter=750, total=   9.9s
[CV] C=10, max_iter=750 ..............................................
[CV] ............................... C=10, max_iter=750, total=  10.1s
[CV] C=10, max_iter=750 ..............................................
[CV] ............................... C=10, max_iter=750, total=   8.8s
[CV] C=10, max_iter=750 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  5.7min finished


GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [1, 5, 10], 'max_iter': [500, 750, 1000]},
             scoring='accuracy', verbose=2)

In [18]:
# Hyperparameter combination the grid search selected as best
best_params = grid.best_params_
print(best_params)

{'C': 5, 'max_iter': 500}


In [19]:
# Cross-validated accuracy of the best hyperparameter combination
best_cv_accuracy = grid.best_score_
print(best_cv_accuracy)

0.6658761528326747


In [28]:
# Create and fit a logistic regression classifier with the best hyperparameters
# found by the grid search. The values are read from grid.best_params_ instead of
# being hard-coded, so this cell stays in sync if the search is re-run or its grid changes.
log_model2 = LogisticRegression(**grid.best_params_)
log_model2.fit(X_train_scaled, np.ravel(encoded_y_train))


LogisticRegression(C=5, max_iter=500)

In [29]:
# Mean accuracy of the tuned model on the training split and the held-out test split
training_score = log_model2.score(X_train_scaled, encoded_y_train)
testing_score = log_model2.score(X_test_scaled, encoded_y_test)
print(
    f"Logistic Regression Training Score: {training_score: .3f}",
    f"Logistic Regression Testing Score: {testing_score: .3f}",
    sep="\n",
)

Logistic Regression Training Score:  0.957
Logistic Regression Testing Score:  0.659


In [30]:
# Predict encoded classes with the tuned model for both splits, then map the
# test predictions back to their original string labels.
predictions_train = log_model2.predict(X_train_scaled)
predictions_test = log_model2.predict(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(predictions_test)

# Spot-check: every 10th row among the first 100 test rows, predicted vs. actual
every_tenth = slice(None, 100, 10)
print(prediction_labels[every_tenth])
print(y_test[every_tenth])

['Good' 'Good' 'Good' 'Good' 'Good' 'Good' 'Excellent' 'Good' 'Good'
 'Good']
[['Good']
 ['Good']
 ['Good']
 ['Excellent']
 ['Good']
 ['Good']
 ['Good']
 ['Good']
 ['Good']
 ['Good']]


## Mean Squared Error (Baseline Model)

In [25]:
# Mean squared error between the encoded true classes and the predicted class codes.
# NOTE(review): this re-predicts with log_model (the baseline), overwriting the
# predictions_test produced by log_model2 — confirm the baseline is intended here.
# NOTE(review): the targets are label-encoded categories, so a squared distance between
# codes may not be a meaningful error measure — consider relying on accuracy or the
# classification report instead.
from sklearn.metrics import mean_squared_error
predictions_test = log_model.predict(X_test_scaled)

MSE = mean_squared_error(y_true=encoded_y_test, y_pred=predictions_test)
MSE


0.9573122529644269

## Save Model

In [32]:
# Persist the tuned classifier to disk with joblib.
# NOTE(review): only the model is written here; X_scaler and label_encoder are not saved
# by this cell, so anything loading this file needs them from elsewhere to prepare
# inputs and decode predictions.
import joblib
filename = 'logistic.sav'
joblib.dump(value=log_model2, filename=filename)


['logistic.sav']