In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

## Data Preprocessing

In [2]:
# Load the movie dataset from the Resources folder (path is relative to the
# notebook's working directory) and preview the first rows.
MOVIES_CSV_PATH = "../Resources/imdb_final_classes.csv"
movies = pd.read_csv(MOVIES_CSV_PATH)
movies.head()

Unnamed: 0,index,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,...,under45,males,malesunder18,malesunder30,malesunder45,females,femalesunder18,femalesunder30,femalesunder45,rating_class
0,0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,...,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7,Good
1,1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,...,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6,Good
2,2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,...,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5,Bad
3,3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,...,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6,Good
4,4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,...,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6,Good


In [3]:
# Convert the median_vote column to integer dtype and store it back on the
# same `movies` frame.
median_vote_as_int = movies["median_vote"].astype(int)
movies["median_vote"] = median_vote_as_int

In [4]:
# Choose the predictor columns (X) and the target (y).
# y comes from rating_class and is reshaped into a single-column 2-D array.
FEATURE_COLUMNS = ["year", "genre", "duration", "director", "budget"]
TARGET_COLUMN = "rating_class"

X = movies[FEATURE_COLUMNS]
target_values = movies[TARGET_COLUMN].values
y = target_values.reshape(-1, 1)
print(X.shape, y.shape)

(5060, 5) (5060, 1)


In [5]:
# Take an independent (deep) copy of the feature frame and display it.
data = X.copy(deep=True)
data

Unnamed: 0,year,genre,duration,director,budget
0,2001,Comedy,118,James Mangold,48000000
1,2000,Musical,86,Michael Ritchie,10000000
2,2001,Drama,104,Vondie Curtis-Hall,22000000
3,2001,Comedy,100,Jeremy Kasten,1000000
4,2000,Crime,87,Bryan Johnson,120000
...,...,...,...,...,...
5055,2019,Comedy,84,Jon Lucas,5000000
5056,2019,Drama,94,Dan Sallitt,95000
5057,2019,Action,84,Glenn Miller,100000
5058,2019,Action,92,Keoni Waxman,3000000


In [6]:
# One-hot (dummy) encode both categorical columns: genre and director.
# NOTE(review): director has many distinct values, so this creates one column
# per director; that may contribute to the train/test score gap reported
# later in the notebook -- confirm before relying on this encoding.
CATEGORICAL_COLUMNS = ["genre", "director"]
data_binary_encoded = pd.get_dummies(data=data, columns=CATEGORICAL_COLUMNS)
data_binary_encoded.head()

Unnamed: 0,year,duration,budget,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,...,director_Zack Snyder,director_Zackary Adler,director_Zak Knutson,director_Zak Penn,director_Zebediah De Soto,director_Zia Mojabi,director_Ziad H. Hamzeh,director_Zoe Quist,director_Zoran Lisinac,director_mink
0,2001,118,48000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,86,10000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,104,22000000,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2001,100,1000000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,87,120000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Partition the encoded features and the target into train and test sets.
# The split is stratified on y, and random_state is fixed so the same
# partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_binary_encoded,
    y,
    random_state=42,
    stratify=y,
)


In [17]:
# Encode the string class labels as integers.
# LabelEncoder expects 1-D label arrays, but y_train / y_test are column
# vectors of shape (n, 1). Flatten them before fit/transform so sklearn does
# not emit a DataConversionWarning, then reshape the encoded labels back to
# (n, 1) so later cells receive the same shapes as before.
# NOTE(review): LabelEncoder assigns codes by sorted class name, not by rating
# quality -- confirm that ordering is meaningful for a regression target.
label_encoder = LabelEncoder()
label_encoder.fit(y_train.ravel())
encoded_y_train = label_encoder.transform(y_train.ravel()).reshape(-1, 1)
encoded_y_test = label_encoder.transform(y_test.ravel()).reshape(-1, 1)
encoded_y_train

  return f(**kwargs)


array([[1],
       [0],
       [0],
       ...,
       [0],
       [2],
       [2]])

In [18]:
# Standardise the features and the encoded target. Each scaler is fitted on
# the training partition only; the test partition is transformed with the
# already-fitted scaler.
X_scaler = StandardScaler()
y_scaler = StandardScaler()

# fit_transform fits the scaler on the training data and returns the scaled
# training data in one step
X_train_scaled = X_scaler.fit_transform(X_train)
y_train_scaled = y_scaler.fit_transform(encoded_y_train)

# Reuse the fitted scalers for the test data
X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(encoded_y_test)
print(y_train_scaled)

[[-0.53732163]
 [-1.75544326]
 [-1.75544326]
 ...
 [-1.75544326]
 [ 0.68079999]
 [ 0.68079999]]


## Create Elastic Net Regression Model

In [19]:
# Configure the ElasticNet regressor, then fit it on the scaled training data.
# fit() returns the estimator itself, so en_model is the fitted model.
elastic_net = ElasticNet(alpha=0.01, max_iter=10000)
en_model = elastic_net.fit(X_train_scaled, y_train_scaled)


In [20]:
# Score the fitted model on the training and test partitions. For sklearn
# regressors, .score() returns R^2 (the coefficient of determination).
# A large gap between the two values is a sign of over-fitting.
training_score = en_model.score(X_train_scaled, y_train_scaled)
testing_score = en_model.score(X_test_scaled, y_test_scaled)
print(f"ElasticNet Regression Training Score: {training_score: .3f}")
print(f"ElasticNet Regression Testing Score: {testing_score: .3f}")

ElasticNet Regression Training Score:  0.831
ElaseticNet Regression Testing Score:  0.032


In [21]:
# Predict on both partitions and compare the first few test predictions with
# the corresponding scaled targets. The predictions print as a flat array
# while the targets print as a column vector.
PREVIEW_ROWS = 5
predictions_train = en_model.predict(X_train_scaled)
predictions_test = en_model.predict(X_test_scaled)
print(predictions_test[:PREVIEW_ROWS])
print(y_test_scaled[:PREVIEW_ROWS])

[0.56688268 0.58246372 0.52591955 0.43184925 0.52865757]
[[ 0.68079999]
 [ 0.68079999]
 [-0.53732163]
 [-0.53732163]
 [ 0.68079999]]


## Grid Search

In [13]:
from sklearn.model_selection import GridSearchCV


## Plot Residuals

In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the test-set predictions. Both arguments are in the
# standardised target space (y_test_scaled), not in original label units.
predictions_test = en_model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions_test)
MSE


0.9467527067778029

## Save Model