In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Create connection to AWS table

engine = create_engine('postgresql+psycopg2://postgres:moviesondemand@moviesondemandaws.cfwjiare7kds.us-east-2.rds.amazonaws.com:5432/postgres')

In [3]:
# Create dataframe for main data to be used for modeling using connection with AWS
main_df = pd.read_sql_table('consolidated_pre_transformation', con=engine)

In [4]:
# Preview main dataframe

main_df.head()

Unnamed: 0,original_title,votes,title_length,month_number,genre,duration,imdb_rating,budget_millions,oscar_nominated_actors_count,is_oscar_directed,mpa_rating
0,Kate & Leopold,77852,14,3,"Comedy, Fantasy, Romance",118,6.4,48,0,0,PG-13
1,The Omen,55534,18,6,"Action, Adventure, Fantasy",110,5.5,25,0,0,R
2,Brooklyn Rules,5441,14,5,"Crime, Drama",99,6.3,8,0,0,R
3,Straw Dogs,32371,10,9,"Action, Drama, Thriller",110,5.8,25,1,0,R
4,How Do You Know,45573,11,2,"Comedy, Drama, Romance",121,5.4,120,1,0,PG-13


In [5]:

#Split out genre into columns (manual dummy variable creation)

main_df['isHorror'] = main_df['genre'].apply(lambda x: 1 if 'Horror' in x else 0)
main_df['isComedy'] = main_df['genre'].apply(lambda x: 1 if 'Comedy' in x else 0)
main_df['isRomance'] = main_df['genre'].apply(lambda x: 1 if 'Romance' in x else 0)
main_df['isFantasy'] = main_df['genre'].apply(lambda x: 1 if 'Fantasy' in x else 0)
main_df['isWestern'] = main_df['genre'].apply(lambda x: 1 if 'Western' in x else 0)
main_df['isWar'] = main_df['genre'].apply(lambda x: 1 if 'War' in x else 0)
main_df['isHistory'] = main_df['genre'].apply(lambda x: 1 if 'History' in x else 0)
main_df['isDrama'] = main_df['genre'].apply(lambda x: 1 if 'Drama' in x else 0)
main_df['isSport'] = main_df['genre'].apply(lambda x: 1 if 'Sport' in x else 0)
main_df['isMusic'] = main_df['genre'].apply(lambda x: 1 if 'Music' in x else 0)
main_df['isMusical'] = main_df['genre'].apply(lambda x: 1 if 'Musical' in x else 0)
main_df['isAnimation'] = main_df['genre'].apply(lambda x: 1 if 'Animation' in x else 0)
main_df['isBiography'] = main_df['genre'].apply(lambda x: 1 if 'Biography' in x else 0)
main_df['isFamily'] = main_df['genre'].apply(lambda x: 1 if 'Family' in x else 0)
main_df['isSci-Fi'] = main_df['genre'].apply(lambda x: 1 if 'Sci-Fi' in x else 0)
main_df['isMystery'] = main_df['genre'].apply(lambda x: 1 if 'Mystery' in x else 0)
main_df['isAdventure'] = main_df['genre'].apply(lambda x: 1 if 'Adventure' in x else 0)
main_df['isCrime'] = main_df['genre'].apply(lambda x: 1 if 'Crime' in x else 0)
main_df['isThirller'] = main_df['genre'].apply(lambda x: 1 if 'Thriller' in x else 0)
main_df['isAction'] = main_df['genre'].apply(lambda x: 1 if 'Action' in x else 0)

In [6]:
#Create dummy variables for MPAA rating
mpaa_dummies_df = pd.get_dummies(main_df['mpa_rating'])
mpaa_dummies_df

Unnamed: 0,G,NC-17,PG,PG-13,R,Unrated
0,0,0,0,1,0,0
1,0,0,0,0,1,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,1,0,0
...,...,...,...,...,...,...
2721,0,0,0,0,1,0
2722,0,0,0,0,1,0
2723,0,0,0,0,1,0
2724,0,0,1,0,0,0


In [7]:
#Add MPAA rating dummy variables to main dataframe
dummies_main_df = pd.concat([main_df, mpaa_dummies_df] , axis = 1)
dummies_main_df

Unnamed: 0,original_title,votes,title_length,month_number,genre,duration,imdb_rating,budget_millions,oscar_nominated_actors_count,is_oscar_directed,...,isAdventure,isCrime,isThirller,isAction,G,NC-17,PG,PG-13,R,Unrated
0,Kate & Leopold,77852,14,3,"Comedy, Fantasy, Romance",118,6.4,48,0,0,...,0,0,0,0,0,0,0,1,0,0
1,The Omen,55534,18,6,"Action, Adventure, Fantasy",110,5.5,25,0,0,...,1,0,0,1,0,0,0,0,1,0
2,Brooklyn Rules,5441,14,5,"Crime, Drama",99,6.3,8,0,0,...,0,1,0,0,0,0,0,0,1,0
3,Straw Dogs,32371,10,9,"Action, Drama, Thriller",110,5.8,25,1,0,...,0,0,1,1,0,0,0,0,1,0
4,How Do You Know,45573,11,2,"Comedy, Drama, Romance",121,5.4,120,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2721,The Prodigy,22022,32,3,"Horror, Mystery, Thriller",92,5.8,6,0,0,...,0,0,1,0,0,0,0,0,1,0
2722,The Cabin in the Woods,366795,21,5,Horror,95,7.0,30,0,0,...,0,0,0,0,0,0,0,0,1,0
2723,The Taking,24825,10,10,"Horror, Mystery, Thriller",90,6.0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
2724,Finding Dory,233601,20,9,"Animation, Adventure, Comedy",97,7.3,200,1,0,...,1,0,0,0,0,0,1,0,0,0


In [8]:
#Remove N/A rows
clean_main_df = dummies_main_df.dropna()

In [9]:
# Create a new DataFrame that holds only the title names.
df_titles = clean_main_df.filter(["original_title"], axis=1)

In [60]:
# Create final clean dataframe while removing unecessary columns
final_main_df = clean_main_df.drop([ 'original_title', 'genre', 'mpa_rating', ], axis=1)

# Not dropping votes column improved R2 Score by about 10 points

In [61]:
# Create X and y variables

X = final_main_df.drop(['imdb_rating'], axis=1)
y = final_main_df['imdb_rating']

#Split and Test Data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 1)

In [62]:
# Initiate linear regression model and fit model

mlr = LinearRegression()  
mlr.fit(X_train, y_train)

LinearRegression()

In [63]:
y_test_predict = mlr.predict(X_test) #y_test prediction values
y_test_predict

array([ 5.66465332,  5.63496006,  7.06782175,  5.4400629 ,  6.05305265,
        6.10469818,  7.04673397,  6.16456555,  6.07790726,  5.53575715,
        6.3717243 ,  5.57480177,  5.25055437,  5.77046565,  5.6869248 ,
        6.3564404 ,  8.03259854,  6.47969758,  6.45078868,  5.69072032,
        6.12221343,  5.97410147,  5.884445  ,  5.72579062,  5.65698291,
        7.12115703,  6.21774169,  6.59738758,  6.44953013,  6.2890546 ,
        6.18347589,  6.58428281,  6.37326229,  5.98456404,  6.11562598,
        5.3274396 ,  6.35019216,  6.15098471,  6.64725863,  7.24402129,
        6.751723  ,  6.25770597,  6.48327945,  5.59286546,  5.45940505,
        5.53842749,  6.25493487,  6.96836806,  5.58873837,  5.66840124,
        6.00783569,  6.17309511,  5.39632971,  6.03822105,  6.4409336 ,
        5.35704209,  5.90586398,  5.33242661,  5.72209989,  6.81781947,
        6.21086041,  5.36387945,  6.20352598,  5.92507034,  5.94206374,
        6.16671082,  5.48795298,  6.46711199,  6.11082095,  6.79

In [64]:
y_test

1247    4.7
2306    6.2
1607    6.0
1873    5.9
783     6.6
       ... 
681     7.3
2245    6.7
2642    6.8
2724    7.3
1143    6.7
Name: imdb_rating, Length: 682, dtype: float64

In [65]:
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_test, y_test_predict) #computes the difference between correct answer and predicted answer

0.5836297021814589

In [66]:
#Evaluate performance using R Squared Score
r2_score(y_test, y_test_predict)

0.46975836128165693

In [67]:
#Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true)) 

print('MAE: {:.4f}'.format(mae(y_test, y_test_predict)))
# And the MAE isn't terrible either

MAE: 0.5489


In [70]:
# Create dataframe with title, actual value, and predicted value

mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff['title'] = df_titles['original_title']
mlr_diff.head(40)

Unnamed: 0,Actual value,Predicted value,title
1247,4.7,5.579152,The Santa Clause 3: The Escape Clause
2306,6.2,5.623817,Daddy's Home
1607,6.0,6.985615,Shark Tale
1873,5.9,5.521867,The Super
783,6.6,5.65798,Project X
1978,7.2,6.115353,Duma
403,7.4,6.62894,Wonder Woman
1644,6.3,6.133722,Risen
2541,6.5,6.198212,Gimme Shelter
1354,6.3,5.465508,Diary of a Wimpy Kid: Dog Days


In [69]:
print('Accuracy',100- (np.mean(np.abs((y_test - y_pred_mlr) / y_test)) * 100))

Accuracy 87.64799311992441
