# Model Dependencies & AWS Connection

In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import scipy.stats as stats

In [2]:
# Create connection to AWS table

engine = create_engine('postgresql+psycopg2://postgres:moviesondemand@moviesondemandaws.cfwjiare7kds.us-east-2.rds.amazonaws.com:5432/postgres')

In [3]:
# Create dataframe for main data to be used for modeling using connection with AWS
main_df = pd.read_sql_table('consolidated_pre_transformation', con=engine)
raw_main_df = pd.read_sql_table('consolidated_pre_transformation', con=engine)

# Feature Transformation

In [4]:

#Split out genre into columns (manual dummy variable creation)

main_df['isHorror'] = main_df['genre'].apply(lambda x: 1 if 'Horror' in x else 0)
main_df['isComedy'] = main_df['genre'].apply(lambda x: 1 if 'Comedy' in x else 0)
main_df['isRomance'] = main_df['genre'].apply(lambda x: 1 if 'Romance' in x else 0)
main_df['isFantasy'] = main_df['genre'].apply(lambda x: 1 if 'Fantasy' in x else 0)
main_df['isWestern'] = main_df['genre'].apply(lambda x: 1 if 'Western' in x else 0)
main_df['isWar'] = main_df['genre'].apply(lambda x: 1 if 'War' in x else 0)
main_df['isHistory'] = main_df['genre'].apply(lambda x: 1 if 'History' in x else 0)
main_df['isDrama'] = main_df['genre'].apply(lambda x: 1 if 'Drama' in x else 0)
main_df['isSport'] = main_df['genre'].apply(lambda x: 1 if 'Sport' in x else 0)
main_df['isMusic'] = main_df['genre'].apply(lambda x: 1 if 'Music' in x else 0)
main_df['isMusical'] = main_df['genre'].apply(lambda x: 1 if 'Musical' in x else 0)
main_df['isAnimation'] = main_df['genre'].apply(lambda x: 1 if 'Animation' in x else 0)
main_df['isBiography'] = main_df['genre'].apply(lambda x: 1 if 'Biography' in x else 0)
main_df['isFamily'] = main_df['genre'].apply(lambda x: 1 if 'Family' in x else 0)
main_df['isSci-Fi'] = main_df['genre'].apply(lambda x: 1 if 'Sci-Fi' in x else 0)
main_df['isMystery'] = main_df['genre'].apply(lambda x: 1 if 'Mystery' in x else 0)
main_df['isAdventure'] = main_df['genre'].apply(lambda x: 1 if 'Adventure' in x else 0)
main_df['isCrime'] = main_df['genre'].apply(lambda x: 1 if 'Crime' in x else 0)
main_df['isThirller'] = main_df['genre'].apply(lambda x: 1 if 'Thriller' in x else 0)
main_df['isAction'] = main_df['genre'].apply(lambda x: 1 if 'Action' in x else 0)

In [5]:
#Create dummy variables for MPAA rating
mpaa_dummies_df = pd.get_dummies(main_df['mpa_rating'])


In [6]:
#Add MPAA rating dummy variables to main dataframe
dummies_main_df = pd.concat([main_df, mpaa_dummies_df] , axis = 1)


# Final Data Cleansing

In [7]:
#Remove N/A rows
clean_main_df = dummies_main_df.dropna()


In [8]:
# Create a new DataFrame that holds only the title names.
df_titles = clean_main_df.filter(["original_title"], axis=1)



In [9]:
# Create final clean dataframe while removing unecessary columns
final_main_df = clean_main_df.drop([ 'original_title', 'votes', 'genre', 'mpa_rating', ], axis=1)

# Ridge Regression Machine Learning Model

In [10]:

# Create X and y variables

X = final_main_df.drop(['imdb_rating' ], axis=1)
y = final_main_df['imdb_rating']
 


In [11]:
# Train, Test, & Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [12]:
# Rescale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Initiate ride regression model and fit model

rr = Ridge(alpha=10)
rr.fit(X_train_scaled,y_train)



Ridge(alpha=10)

# RUN DEMO

In [14]:
resources_folder = r'../resources/'
demo_path = resources_folder + 'demo.csv'
demo_df = pd.read_csv(demo_path, low_memory=False)
demo_df_raw = demo_df

## WHAT MOVIE DO YOU THINK PREDICTS THE HIGHEST IMDB RATING

In [15]:
demo_df.head(10)

Unnamed: 0,original_title,votes,title_length,month_number,genre,duration,imdb_rating,budget_millions,oscar_nominated_actors_count,is_oscar_directed,mpa_rating
0,Cakes on a plane,0,16,8,"Action, Thriller, Crime",120,0,30,1,0,G
1,My couch pulls out but Jay Cutler doesn't,0,41,1,"Romance, Drama, Biography",90,0,5,0,1,NC-17
2,Kill Jill,0,9,6,"Action, Crime",140,0,60,1,0,R
3,Hank's Redemption,0,17,12,"Drama, History,",190,0,50,3,0,PG
4,Apocalypse Yesterday,0,20,11,"Action, Horror, Thriller",180,0,120,2,1,R
5,No Country for Boomers,0,22,10,"Action, Drama, Thriller",150,0,105,1,0,Unrated
6,Tears Of The Moon,0,17,7,"Action, War, Thriller",140,0,140,1,1,R
7,Blade Jogger,0,12,5,"Action, Adventure, Drama",144,0,60,2,0,R
8,10 Things I Sort Of Don't Like About You,0,40,9,"Comedy, Romance, Drama",110,0,15,1,1,PG-13


In [16]:
demo_df['isHorror'] = demo_df['genre'].apply(lambda x: 1 if 'Horror' in x else 0)
demo_df['isComedy'] = demo_df['genre'].apply(lambda x: 1 if 'Comedy' in x else 0)
demo_df['isRomance'] = demo_df['genre'].apply(lambda x: 1 if 'Romance' in x else 0)
demo_df['isFantasy'] = demo_df['genre'].apply(lambda x: 1 if 'Fantasy' in x else 0)
demo_df['isWestern'] = demo_df['genre'].apply(lambda x: 1 if 'Western' in x else 0)
demo_df['isWar'] = demo_df['genre'].apply(lambda x: 1 if 'War' in x else 0)
demo_df['isHistory'] = demo_df['genre'].apply(lambda x: 1 if 'History' in x else 0)
demo_df['isDrama'] = demo_df['genre'].apply(lambda x: 1 if 'Drama' in x else 0)
demo_df['isSport'] = demo_df['genre'].apply(lambda x: 1 if 'Sport' in x else 0)
demo_df['isMusic'] = demo_df['genre'].apply(lambda x: 1 if 'Music' in x else 0)
demo_df['isMusical'] = demo_df['genre'].apply(lambda x: 1 if 'Musical' in x else 0)
demo_df['isAnimation'] = demo_df['genre'].apply(lambda x: 1 if 'Animation' in x else 0)
demo_df['isBiography'] = demo_df['genre'].apply(lambda x: 1 if 'Biography' in x else 0)
demo_df['isFamily'] = demo_df['genre'].apply(lambda x: 1 if 'Family' in x else 0)
demo_df['isSci-Fi'] = demo_df['genre'].apply(lambda x: 1 if 'Sci-Fi' in x else 0)
demo_df['isMystery'] = demo_df['genre'].apply(lambda x: 1 if 'Mystery' in x else 0)
demo_df['isAdventure'] = demo_df['genre'].apply(lambda x: 1 if 'Adventure' in x else 0)
demo_df['isCrime'] = demo_df['genre'].apply(lambda x: 1 if 'Crime' in x else 0)
demo_df['isThirller'] = demo_df['genre'].apply(lambda x: 1 if 'Thriller' in x else 0)
demo_df['isAction'] = demo_df['genre'].apply(lambda x: 1 if 'Action' in x else 0)
mpaa_demo_dummies = pd.get_dummies(demo_df['mpa_rating'])
demo_with_dummies = pd.concat([demo_df, mpaa_demo_dummies] , axis = 1)
clean_demo_df = demo_with_dummies.dropna()
df_demo_titles = clean_demo_df.filter(["original_title"], axis=1)
final_demo_df = clean_demo_df.drop([ 'original_title', 'votes', 'genre', 'mpa_rating', ], axis=1)
X_demo = final_demo_df.drop(['imdb_rating' ], axis=1)
X_demo_scaled = scaler.fit_transform(X_demo)
demo_predictions = rr.predict(X_demo_scaled)
demo_results = pd.DataFrame({'Predicted value': demo_predictions})
demo_results['title'] = df_demo_titles['original_title']
demo_results_complete = demo_results.reindex(columns=['title' , 'Predicted value'])
demo_results_main = pd.merge(demo_results_complete, demo_df_raw, how="left", left_on=['title'],right_on=['original_title'])
demo_results_main.sort_values(by=['Predicted value'], ascending = False).head(20)

Unnamed: 0,title,Predicted value,original_title,votes,title_length,month_number,genre,duration,imdb_rating,budget_millions,...,isMusical,isAnimation,isBiography,isFamily,isSci-Fi,isMystery,isAdventure,isCrime,isThirller,isAction
3,Hank's Redemption,7.269155,Hank's Redemption,0,17,12,"Drama, History,",190,0,50,...,0,0,0,0,0,0,0,0,0,0
7,Blade Jogger,6.625111,Blade Jogger,0,12,5,"Action, Adventure, Drama",144,0,60,...,0,0,0,0,0,0,1,0,0,1
4,Apocalypse Yesterday,6.489125,Apocalypse Yesterday,0,20,11,"Action, Horror, Thriller",180,0,120,...,0,0,0,0,0,0,0,0,1,1
5,No Country for Boomers,6.478017,No Country for Boomers,0,22,10,"Action, Drama, Thriller",150,0,105,...,0,0,0,0,0,0,0,0,1,1
6,Tears Of The Moon,6.249521,Tears Of The Moon,0,17,7,"Action, War, Thriller",140,0,140,...,0,0,0,0,0,0,0,0,1,1
2,Kill Jill,6.080011,Kill Jill,0,9,6,"Action, Crime",140,0,60,...,0,0,0,0,0,0,0,1,0,1
0,Cakes on a plane,5.611571,Cakes on a plane,0,16,8,"Action, Thriller, Crime",120,0,30,...,0,0,0,0,0,0,0,1,1,1
8,10 Things I Sort Of Don't Like About You,5.584093,10 Things I Sort Of Don't Like About You,0,40,9,"Comedy, Romance, Drama",110,0,15,...,0,0,0,0,0,0,0,0,0,0
1,My couch pulls out but Jay Cutler doesn't,5.490755,My couch pulls out but Jay Cutler doesn't,0,41,1,"Romance, Drama, Biography",90,0,5,...,0,0,1,0,0,0,0,0,0,0
