In [1]:
# imports 
import pandas as pd 
import psycopg2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# create db connection to load data
conn = psycopg2.connect(
    host="localhost",
    database="movies", 
    user="docker",
    password="docker")

In [3]:
# load data into dataframe 
sql = """SELECT * FROM movie"""
cursor = conn.cursor()
cursor.execute(sql)
movies = cursor.fetchall()
cursor.close()

columns = ['id', 'title', 'release_year', 'release_month', 'imdb_id', 'tmdb_id', 'runtime', 'rating', 'budget', 'imdb_rating', 'rt_rating', 'genre_id']
movie_df = pd.DataFrame(movies, columns =columns) 

In [4]:
# load actor data
sql = """
    SELECT a.id, a.actor_performance, a.tmdb_id, a.actor_name, ma.movie_id
        from actor a
    JOIN movie_actor ma 
    	ON a.id = ma.actor_id
"""
cursor = conn.cursor()
cursor.execute(sql)
actors = cursor.fetchall()
cursor.close()

columns = ['actor_id', 'actor_performance', 'tmdb_id', 'actor_name', 'movie_id']
actor_df = pd.DataFrame(actors, columns=columns) 


In [5]:
# need to add a column that averages a movies actors performance ratings together. 
# I know it's not recommended to iterate through a dataframe but it was tricky with 
# the way I set up my data to do another way 
movie_df['actor_performance'] = ''
for index, row in movie_df.iterrows():
    movie_id = row['id']
    movie_actors = actor_df.loc[actor_df['movie_id'] == movie_id]
    actor_average_performance = movie_actors['actor_performance'].mean()
    row['actor_performance'] = actor_average_performance
    movie_df.at[index,'actor_performance'] = actor_average_performance

In [6]:
# I've got data loaded and now want to clean some of the columns. Year is 
# irrelevant to our prediction model so I'll drop that column. 
del movie_df['release_year']
# title, id, imdb_id tmdb_id will be irrelevant to the model  
del movie_df['title']
del movie_df['imdb_id']
del movie_df['tmdb_id']
del movie_df['id']
# rotten tomatoes rating is irrelevant. I collected it in case I wanted 
# to also predict that rating but will remove it for sake of time
del movie_df['rt_rating']

In [7]:
# check column values to see if enough data is present to use them in model
# the following line shows that the budget value is missing from about half of the 
# rows. This is unfortunate because I'm sure that has an influence on the success
# of a movie. Also check to make sure all movies have a imdb rating
print((movie_df['budget'] == 0).sum())
print(pd.isna(movie_df['imdb_rating']).sum())
print(pd.isna(movie_df['actor_performance']).sum())


2409
6
6


In [8]:
del movie_df['budget']
movie_df = movie_df[movie_df['imdb_rating'].notna()]
movie_df = movie_df[movie_df['actor_performance'].notna()]

In [9]:
# clean up remaining columns before building model
movie_df.rating = pd.Categorical(movie_df.rating)
movie_df = movie_df.convert_dtypes()
movie_df['imdb_rating'] = pd.to_numeric(movie_df['imdb_rating'])
movie_df.dtypes

release_month           Int64
runtime                 Int64
rating               category
imdb_rating           float64
genre_id                Int64
actor_performance     float64
dtype: object

In [10]:
# check correlations
movie_df.corr()

Unnamed: 0,release_month,runtime,imdb_rating,genre_id,actor_performance
release_month,1.0,0.075686,0.060432,0.017597,0.047477
runtime,0.075686,1.0,0.436286,0.010636,0.37203
imdb_rating,0.060432,0.436286,1.0,0.013098,0.664407
genre_id,0.017597,0.010636,0.013098,1.0,0.064042
actor_performance,0.047477,0.37203,0.664407,0.064042,1.0


Our variable correlation to imdb_rating does not look very good. Actor performance is at .6644 which is very low in my opinion. Going to continue on with building the models. 

In [11]:
# split into train and test
x = movie_df[['release_month', 'runtime', 'imdb_rating', 'genre_id', 'actor_performance']]
y = movie_df['imdb_rating']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 465)

In [12]:
model = LinearRegression()
model.fit(x_train, y_train)
print(model.coef_)

[ 1.16787642e-16 -3.85108612e-16  1.00000000e+00 -8.67361738e-17
 -2.06892880e-16]


In [13]:
# predictions and performance
predictions = model.predict(x_test)
print(metrics.mean_absolute_error(y_test, predictions))
print(metrics.mean_absolute_error(y_test, predictions))
print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

9.840187567054174e-15
9.840187567054174e-15
1.2750214361728412e-14


Our RMS value is very low which indicates that our model might not fit the data well. 

Calculating p values of our coefficients. 

In [14]:
x2 = np.column_stack((movie_df['release_month'], movie_df['runtime'], movie_df['genre_id'], movie_df['actor_performance']))
y2 =  y[0:]

x3 = sm.add_constant(x2)
est = sm.OLS(y2, x3.astype(float))
est2 = est.fit()
est2.summary()


0,1,2,3
Dep. Variable:,imdb_rating,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.484
Method:,Least Squares,F-statistic:,1166.0
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,0.0
Time:,21:28:04,Log-Likelihood:,-5701.2
No. Observations:,4977,AIC:,11410.0
Df Residuals:,4972,BIC:,11450.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.5133,0.131,-19.149,0.000,-2.771,-2.256
x1,0.0053,0.003,1.631,0.103,-0.001,0.012
x2,0.0120,0.001,19.816,0.000,0.011,0.013
x3,-0.0002,8.8e-05,-2.637,0.008,-0.000,-5.96e-05
x4,1.1487,0.022,53.109,0.000,1.106,1.191

0,1,2,3
Omnibus:,285.219,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,455.662
Skew:,-0.472,Prob(JB):,1.13e-99
Kurtosis:,4.143,Cond. No.,2120.0


The p values of our coefficient our all within an acceptable range (<.05) except for release_month. 

Overall our linear model does not seem to be a good fit. It seems there is a lot of variance between our predicted values and training values of our data. Let's see if a neural network improves performance at all. 

In [15]:
# transform x variables to be between 0 and 1
min_max_scaler = preprocessing.MinMaxScaler()
x_scale = min_max_scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2)

In [24]:
# Set the "architecture" of our neural model to be 3 layers with 32 nodes in each layer

# create model
model = Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(32, activation = 'relu', input_dim = 5))

# Adding the second hidden layer
model.add(Dense(units = 32, activation = 'relu'))

# Adding the third hidden layer
model.add(Dense(units = 32, activation = 'relu'))
model.add(Dense(units = 1, activation = 'relu'))
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')

# hist = neural_model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))

In [25]:
model.fit(x_train, y_train, batch_size = 10, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2216cc45e50>

In [17]:
# print('derp')
# estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
# kfold = KFold(n_splits=10)
# results = cross_val_score(estimator, x_scale, y, cv=kfold)
# print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

derp
Baseline: -0.02 (0.03) MSE


In [26]:
y_pred = model.predict(x_test)