In [1]:
# Initial Imports
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


In [2]:
# The connection string is read from the environment so that no credentials are
# stored in the notebook. Expected form:
#   postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
# NOTE: the password that used to be hardcoded in this cell should be treated as
# exposed and rotated.
database_url = os.environ.get('MOVIES_DB_URL')
if not database_url:
    raise RuntimeError(
        'Set the MOVIES_DB_URL environment variable to the PostgreSQL connection string.'
    )
engine = create_engine(database_url)

In [3]:
# Pull the whole pre-transformation table into a DataFrame
main_df = pd.read_sql_table(table_name='consolidated_pre_transformation', con=engine)

In [4]:
# Preview the first five rows of the freshly loaded table
main_df.head(n=5)

Unnamed: 0,title_length,month_number,genre,duration,imdb_rating,budget,oscar_nominated_actors_count,is_oscar_directed
0,14,3,"Comedy, Fantasy, Romance",118,6.4,48000000,0,0
1,18,6,"Action, Adventure, Fantasy",110,5.5,25000000,0,0
2,10,9,"Action, Drama, Thriller",110,5.8,25000000,1,0
3,3,4,"Comedy, Drama, Horror",93,6.6,500000,0,0
4,15,4,"Comedy, Horror",94,4.8,900000,0,0


In [5]:
#Genre List Data Frame
def split_genres(value):
    """Split a ', '-separated genre string into a list of genre names.

    Non-string values (for example a missing genre) give an empty list, so
    later cells can iterate over every row without special cases.
    """
    # isinstance also accepts str subclasses, unlike an exact type() comparison
    return value.split(', ') if isinstance(value, str) else []


main_df['Genre List'] = main_df['genre'].apply(split_genres)
main_df['Genre List']

0           [Comedy, Fantasy, Romance]
1         [Action, Adventure, Fantasy]
2            [Action, Drama, Thriller]
3              [Comedy, Drama, Horror]
4                     [Comedy, Horror]
                     ...              
2557                  [Drama, Romance]
2558       [Horror, Mystery, Thriller]
2559                          [Horror]
2560       [Horror, Mystery, Thriller]
2561    [Animation, Adventure, Comedy]
Name: Genre List, Length: 2562, dtype: object

In [6]:
# Flatten the per-movie genre lists into one long list of genre names
genres = [genre for genre_list in main_df['Genre List'] for genre in genre_list]


In [7]:
# Genre Counter: how often each genre appears, most frequent first
from collections import Counter

genre_counts = Counter(genres)
genre_counts.most_common()

[('Drama', 1186),
 ('Comedy', 1050),
 ('Action', 755),
 ('Adventure', 566),
 ('Crime', 502),
 ('Thriller', 462),
 ('Romance', 415),
 ('Horror', 313),
 ('Mystery', 279),
 ('Fantasy', 240),
 ('Sci-Fi', 221),
 ('Animation', 168),
 ('Family', 167),
 ('Biography', 157),
 ('Sport', 68),
 ('Music', 66),
 ('History', 50),
 ('War', 25),
 ('Musical', 16),
 ('Western', 10)]

In [8]:
# Genre Consolidation: each genre name once, ordered from most to least frequent
genres_no_repeats = [name for name, _count in Counter(genres).most_common()]
genres_no_repeats

['Drama',
 'Comedy',
 'Action',
 'Adventure',
 'Crime',
 'Thriller',
 'Romance',
 'Horror',
 'Mystery',
 'Fantasy',
 'Sci-Fi',
 'Animation',
 'Family',
 'Biography',
 'Sport',
 'Music',
 'History',
 'War',
 'Musical',
 'Western']

In [9]:
# Preview the first five rows, now including the Genre List column
main_df.head(n=5)

Unnamed: 0,title_length,month_number,genre,duration,imdb_rating,budget,oscar_nominated_actors_count,is_oscar_directed,Genre List
0,14,3,"Comedy, Fantasy, Romance",118,6.4,48000000,0,0,"[Comedy, Fantasy, Romance]"
1,18,6,"Action, Adventure, Fantasy",110,5.5,25000000,0,0,"[Action, Adventure, Fantasy]"
2,10,9,"Action, Drama, Thriller",110,5.8,25000000,1,0,"[Action, Drama, Thriller]"
3,3,4,"Comedy, Drama, Horror",93,6.6,500000,0,0,"[Comedy, Drama, Horror]"
4,15,4,"Comedy, Horror",94,4.8,900000,0,0,"[Comedy, Horror]"


In [10]:
# One-hot style genre flags.
# The eight most common genres each get their own is<Genre> column; every other
# genre is folded into a single isOther flag.
OTHER_GENRES = {
    'Romance', 'Fantasy', 'Western', 'War', 'History', 'Sport',
    'Music', 'Musical', 'Animation', 'Biography', 'Family', 'Sci-Fi',
}
# Order matters: it fixes the column order of the resulting feature matrix.
MAIN_GENRES = ['Mystery', 'Horror', 'Adventure', 'Crime', 'Thriller', 'Action', 'Comedy', 'Drama']


def _genre_tokens(value):
    """Return the set of genre names in a comma-separated genre string.

    Non-string values (e.g. a missing genre) give an empty set, so such rows
    get 0 for every flag instead of raising a TypeError.
    """
    if not isinstance(value, str):
        return set()
    return {token.strip() for token in value.split(',') if token.strip()}


# Match whole genre names rather than substrings, so that e.g. 'Music' is
# never triggered by 'Musical'.
genre_sets = [_genre_tokens(value) for value in main_df['genre']]

main_df['isOther'] = [int(not tokens.isdisjoint(OTHER_GENRES)) for tokens in genre_sets]
for genre_name in MAIN_GENRES:
    main_df['is' + genre_name] = [int(genre_name in tokens) for tokens in genre_sets]

In [11]:
# The genre text and its list form are now encoded by the is* flags, so drop both
stripped_main_df = main_df.drop(columns=['genre', 'Genre List'])

In [12]:
# Keep only fully populated rows: a missing value in any column drops the row
clean_main_df = stripped_main_df.dropna(axis=0, how='any')

In [13]:
# save final clean machine learning dataset off to resources folder
resources_folder = r'Resources/'
# to_csv does not create missing directories, so make sure the folder exists first
os.makedirs(resources_folder, exist_ok=True)
clean_main_df.to_csv(os.path.join(resources_folder, 'clean_ml_data.csv'), index=False)

In [14]:
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

# Separate the target (IMDb rating) from the feature columns
# NOTE(review): every remaining column is used as a feature; confirm none of them
# is a leftover row id (the head() preview shows an 'Unnamed: 0' header).
target_column = 'imdb_rating'
y = clean_main_df[target_column]
X = clean_main_df.drop(columns=[target_column])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)


In [15]:
from sklearn.linear_model import LinearRegression

# Linear regression model, fitted on the training split only
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [16]:
# Predicted ratings for the held-out test rows
y_test_predict = lr.predict(X=X_test)
y_test_predict

array([6.56319596, 6.15556957, 5.68149759, 6.0748428 , 6.53550421,
       6.20625342, 6.26153088, 6.16498987, 6.42064882, 5.81661872,
       6.18523089, 6.32916482, 5.31387308, 5.92811875, 5.63784432,
       6.53132247, 6.42918244, 5.90892135, 6.9231064 , 5.72566092,
       5.88016771, 6.04509844, 5.51158714, 5.80815946, 5.73392885,
       5.91882555, 5.4439912 , 6.15544459, 7.27005859, 6.17019083,
       7.20452325, 5.73903526, 6.64570056, 5.67225563, 6.79329109,
       7.12953934, 6.10254539, 6.61192336, 6.23202227, 6.19603977,
       6.68483937, 6.79365435, 6.7023154 , 6.7893897 , 5.70886518,
       5.98442013, 6.8170489 , 6.05038626, 5.73697   , 5.34803026,
       6.2580242 , 5.64091989, 6.68432607, 5.96512939, 6.58220215,
       6.23556433, 6.76227617, 6.72528482, 5.83627711, 6.63847325,
       5.98641033, 6.10274445, 5.8730589 , 6.80703015, 5.80941935,
       6.57804309, 6.06642491, 5.85032313, 7.28052612, 6.9423201 ,
       5.93941652, 6.36299461, 6.85250062, 5.84769821, 6.91133

In [17]:
# Actual IMDb ratings of the held-out test rows, shown for comparison with the predictions
y_test

893     7.6
586     7.6
705     5.9
2493    6.0
366     6.2
       ... 
1204    6.4
1406    4.4
1164    5.8
1080    5.8
1791    5.6
Name: imdb_rating, Length: 641, dtype: float64

In [18]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error: the average of the squared gaps between actual and predicted ratings
mean_squared_error(y_true=y_test, y_pred=y_test_predict)

0.7068132781006802

In [19]:
# R squared score of the test-set predictions against the actual ratings
r2_score(y_true=y_test, y_pred=y_test_predict)

0.2755494404547082

In [20]:
#Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    """Return the mean absolute error between actual and predicted values.

    Parameters
    ----------
    y_true : array-like
        Actual values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_pred - y_true|``.
    """
    # Convert both inputs to plain float arrays so that values are compared
    # position by position: plain lists are accepted, and two pandas Series
    # with different indexes are not re-aligned by label.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(predicted - actual))

# Report the test-set MAE to four decimal places
test_mae = mae(y_test, y_test_predict)
print('MAE: {:.4f}'.format(test_mae))
# And the MAE isn't terrible either

MAE: 0.6375
