# Engineering Features

Now that I know how badly my model is scoring, I am going to try and utilize some additional features in order to hopefully feed in better data for my model to train on. 

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from rake_nltk import Rake
import datetime, time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

In [2]:
with open('./Assets_&_Data/series_df_full.pickle', 'rb') as f:
    series_df = pickle.load(f)
    
with open('./Assets_&_Data/model_prelim.pickle', 'rb') as f:
    model_df = pickle.load(f)
    
with open('./Assets_&_Data/week_day.pickle', 'rb') as f:
    week_day = pickle.load(f)
    
with open('./Assets_&_Data/cleaned_series_df.pickle', 'rb') as f:
    clean_series_df = pickle.load(f)

# Categorizing Seasons

After loading in the dataframe from my previous notebook, the first thing I want to try and do is to reduce the number of dummied months by sorting them into seasons instead. 

In [3]:
week_day['spring'] = (week_day['month'] < 7) & (week_day['month'] > 2)

In [4]:
week_day['summer'] = (week_day['month'] < 9) & (week_day['month'] > 5)

In [5]:
week_day['fall'] = (week_day['month'] < 12) & (week_day['month'] > 8) 

In [6]:
week_day['winter'] = (week_day['month'] < 3) | (week_day['month'] == 12)

In [7]:
week_day_ = week_day.drop('released', axis=1)
week_day_dum1 = pd.get_dummies(week_day_['weekday'], prefix='day')
week_day_dum2 = pd.get_dummies(week_day_['month'], prefix='month')
week_day_dum3 = week_day_[['spring', 'summer', 'fall', 'winter']]

In [8]:
clean_series_df['network'].isnull().sum()

0

In [9]:
network_dummies = pd.get_dummies(clean_series_df['network'], prefix='network')

In [10]:
for row in range(0, len(clean_series_df['network'])):
    if clean_series_df['network'][row] == '':
        clean_series_df['network'][row] = 'None'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Making an Actor Dataframe

I want to try and imbue some weight to the actors column so that I don't have to be dummying those out. It seems reasonable that the starring actors should have some sort of effect on the success of a show, whether due to popularity or quality of acting. 

I will start by making a temporary dataframe to CountVectorize, then pass it back through the main Dataframe.

# Topic Modeling

In [11]:
nlp_df = clean_series_df[['writer', 'overview_x', 'number_of_episodes', 'number_of_seasons', 
                     'overview_y', 'status_y', 'actors', 'awards', 'genre_y', 'imdb_rating',
                     'imdb_votes', 'plot', 'runtime_x', 'runtime_cat', 'network']]

In [12]:
nlp_df[['runtime_x', 'awards']] = nlp_df[['runtime_x', 'awards']].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


I want to make sure I don't have any null values and verify the data type.

In [13]:
nlp_df['overview_x'].fillna('N/A', inplace=True)
nlp_df['overview_y'].fillna('N/A', inplace=True)
nlp_df['plot'].fillna('N/A', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [14]:
nlp_df['plot'] = nlp_df['plot'].astype(str)
nlp_df['overview_x'] = nlp_df['overview_x'].astype(str)
nlp_df['overview_y'] = nlp_df['overview_y'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
nlp_df['bag_of_words'] = nlp_df[['overview_x', 'plot','overview_y']].apply(lambda x: ''.join(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
nlp_df.drop(['overview_x', 'overview_y', 'plot'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [17]:
nlp_df.columns

Index(['writer', 'number_of_episodes', 'number_of_seasons', 'status_y',
       'actors', 'awards', 'genre_y', 'imdb_rating', 'imdb_votes', 'runtime_x',
       'runtime_cat', 'network', 'bag_of_words'],
      dtype='object')

In [18]:
test_df = nlp_df[['actors']]

In [19]:
test_df['imdb_votes'] = nlp_df[['imdb_votes']]
test_df['imdb_rating'] = nlp_df[['imdb_rating']]
test_df['number_of_episodes'] = nlp_df[['number_of_episodes']]
test_df['number_of_seasons'] = nlp_df[['number_of_seasons']]
test_df['awards'] = nlp_df[['awards']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [20]:
test_df['actors'] = test_df['actors'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Now that my data is cleaner, I can CountVectorize the actor names. I am using CountVectorizier over TF-IDF because this dataset/corpus is relatively small and I will want all of the actors to be seen; if I were to TF-IDF these names, I may just receive a shorter list of the strongest features.

For the sake of avoiding actors with the same first or last names, I will be combining their first and last name along with removing punctuation marks.

In [21]:
cv = CountVectorizer(stop_words=None, analyzer='word', 
                     ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

# List of strings
name_list = []

for row in test_df['actors']:
    name_list.append(row)

In [22]:
name_list_clean = []
for item in range(0, len(name_list)):
    name_list_clean.append(name_list[item].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', ''))


In [23]:
# Categorizing PM slots
for i in range(1, 13):
    if i < 6:
        pm_slot[pm_slot['airsTime'].str.contains(str(i))] = 'afternoon'
        pm_slot_[pm_slot_['airsTime'].str.contains(str(i))] = 'afternoon'
    elif i >= 6 & i < 10:
        pm_slot[pm_slot['airsTime'].str.contains(str(i))] = 'evening'
        pm_slot_[pm_slot_['airsTime'].str.contains(str(i))] = 'evening'
    else: 
        pm_slot[pm_slot['airsTime'].str.contains(str(i))] = 'latenight'
        pm_slot_[pm_slot_['airsTime'].str.contains(str(i))] = 'latenight'

#pm_slot[pm_slot['airsTime'].str.contains('8', '9')]

NameError: name 'pm_slot' is not defined

In [None]:
# Categorizing PM slots
for i in range(1, 13):
    if i < 5:
        am_slot[am_slot['airsTime'].str.contains(str(i))] = 'latenight'
        am_slot_[am_slot_['airsTime'].str.contains(str(i))] = 'latenight'
    else: 
        am_slot[am_slot['airsTime'].str.contains(str(i))] = 'morning'
        am_slot_[am_slot_['airsTime'].str.contains(str(i))] = 'morning'

#pm_slot[pm_slot['airsTime'].str.contains('8', '9')]

In [None]:
missing_time['airsTime'] = 'unknown'

In [None]:
timeslot = pd.concat((am_slot, am_slot_, pm_slot, pm_slot_, remaining_time, missing_time))

In [None]:
week_day = series_df[['released']]

In [None]:
week_day.head()

In [None]:
week_day['released'] = pd.to_datetime(week_day['released'])

In [None]:
week_day['weekday'] = week_day['released'].dt.dayofweek

In [None]:
week_day.head(10)

In [None]:
week_day['month'] = week_day['released'].dt.month

In [None]:
week_day.head()

# Count Vectorizing the actors

In [None]:
count_train = cv.fit(name_list_clean)
bag_of_words = cv.transform(name_list_clean)

In [None]:
match = cv.vocabulary_

I now have a list of vectorized values for each actor, and will want to iterate through the test_df & multiply the imdb rating by this value (might want to scale it down?), then remove the actual actors themselves

In [None]:
#actors_split = pd.concat([test_df['actors'].str.split(', ', expand=True)], axis=1)
#actors_split

In [None]:
actors_split = pd.concat([test_df['actors'].str.split(', ', expand=True)], axis=1)
test_df = pd.concat((test_df ,actors_split), axis=1)

In [None]:
test_df[1] = test_df[1].fillna("none")
test_df[2] = test_df[2].fillna("none")
test_df[3] = test_df[3].fillna("none")

In [None]:
for number in range(0, len(test_df[0])):
    test_df['actor_1'] = test_df[0][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[1])):
    test_df['actor_2'] = test_df[1][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[2])):
    test_df['actor_3'] = test_df[2][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[3])):
    test_df['actor_4'] = test_df[3][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

In [None]:
for number in range(0, len(test_df[0])):
    test_df['actor_1'][number] = test_df[0][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[1])):
    test_df['actor_2'][number] = test_df[1][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[2])):
    test_df['actor_3'][number] = test_df[2][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(test_df[3])):
    test_df['actor_4'][number] = test_df[3][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

In [None]:
#df[2] = df[0].map(match).fillna(df[2])
test_df['actor_1'] = test_df['actor_1'].map(match).fillna(test_df['actor_1'])
test_df['actor_2'] = test_df['actor_2'].map(match).fillna(test_df['actor_2'])
test_df['actor_3'] = test_df['actor_3'].map(match).fillna(test_df['actor_3'])
test_df['actor_4'] = test_df['actor_4'].map(match).fillna(test_df['actor_4'])

# This may allow me to replace the values in the cell with the count vectorized values, 
# but I'll need to have the words match first.

In [None]:
temp_df = test_df.drop(['actors', 0, 1, 2, 3], axis=1)

In [None]:
def remove_nones(df):
    for i in range(0, len(df)):
        if df[i] == 'none':
            df[i] = 0

In [None]:
remove_nones(temp_df['actor_2'])
remove_nones(temp_df['actor_3'])
remove_nones(temp_df['actor_4'])

In [None]:
temp_df.tail()

In [None]:
temp_df['actor_1'] = temp_df['actor_1'].astype(int)
temp_df['actor_2'] = temp_df['actor_2'].astype(int)
temp_df['actor_3'] = temp_df['actor_3'].astype(int)
temp_df['actor_4'] = temp_df['actor_4'].astype(int)
temp_df['imdb_rating'] = temp_df['imdb_rating'].astype(float)

In [None]:
temp_df['actor_1_weighted'] = temp_df['imdb_rating'] * temp_df['actor_1']
temp_df['actor_2_weighted'] = temp_df['imdb_rating'] * temp_df['actor_2']
temp_df['actor_3_weighted'] = temp_df['imdb_rating'] * temp_df['actor_3']
temp_df['actor_4_weighted'] = temp_df['imdb_rating'] * temp_df['actor_4']
temp_df

In [None]:
temp_df['actors_cum_sum'] = temp_df['actor_1_weighted'] + temp_df['actor_2_weighted'] + temp_df['actor_3_weighted'] + temp_df['actor_4_weighted'] 

In [None]:
temp_df

In [None]:
temp_df.isnull().sum()

In [None]:
temp_df.dropna(inplace=True)

# Use this space to assign weight to genre the same way that the weights were given to actors

## Possibly do the same for networks?

In [None]:
#genre_df

genre_df.drop(['writer', 'number_of_episodes', 'number_of_seasons', 'status_y',
       'actors', 'awards', 'genre_y', 'imdb_rating', 'imdb_votes', 'runtime_x',
       'runtime_cat', 'network', 'bag_of_words'], axis=1, inplace=True)

In [None]:
nlp_df.columns

In [None]:
genre_df = nlp_df[['genre_y']]

In [None]:
genre_df['imdb_rating'] = nlp_df[['imdb_rating']]

In [None]:
genre_df['genre_y'] = genre_df['genre_y'].astype(str)

In [None]:
cv = CountVectorizer(stop_words=None, analyzer='word', 
                     ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

# List of strings
genre_list = []

for row in genre_df['genre_y']:
    genre_list.append(row)

In [None]:
genre_list_clean = []
for item in range(0, len(genre_list)):
    genre_list_clean.append(genre_list[item].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', ''))


In [None]:
genre_count = cv.fit(genre_list_clean)
genre_bag_words = cv.transform(genre_list_clean)

In [None]:
genre_match = cv.vocabulary_

In [None]:
genre_split = pd.concat([genre_df['genre_y'].str.split(', ', expand=True)], axis=1)
genre_df = pd.concat((genre_df, genre_split), axis=1)

In [None]:
genre_df

In [None]:
genre_df[1] = genre_df[1].fillna("none")
genre_df[2] = genre_df[2].fillna("none")
genre_df[3] = genre_df[3].fillna("none")
genre_df[4] = genre_df[4].fillna("none")
genre_df[5] = genre_df[5].fillna("none")

In [None]:
genre_df.head()

In [None]:
for number in range(0, len(genre_df)):
    genre_df['genre_1'] = genre_df[0][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_2'] = genre_df[1][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_3'] = genre_df[2][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_4'] = genre_df[3][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_5'] = genre_df[4][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_6'] = genre_df[5][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')



In [None]:
for number in range(0, len(genre_df)):
    genre_df['genre_1'][number] = genre_df[0][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_2'][number] = genre_df[1][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_3'][number] = genre_df[2][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_4'][number] = genre_df[3][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_5'][number] = genre_df[4][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')

for number in range(0, len(genre_df)):
    genre_df['genre_6'][number] = genre_df[5][number].lower().replace("'", "").replace(' ', '').replace('.', '').replace('-', '')



In [None]:
genre_df['genre_1'] = genre_df['genre_1'].map(genre_match).fillna(genre_df['genre_1'])
genre_df['genre_2'] = genre_df['genre_2'].map(genre_match).fillna(genre_df['genre_2'])
genre_df['genre_3'] = genre_df['genre_3'].map(genre_match).fillna(genre_df['genre_3'])
genre_df['genre_4'] = genre_df['genre_4'].map(genre_match).fillna(genre_df['genre_4'])
genre_df['genre_5'] = genre_df['genre_5'].map(genre_match).fillna(genre_df['genre_5'])
genre_df['genre_6'] = genre_df['genre_6'].map(genre_match).fillna(genre_df['genre_6'])

In [None]:
genre_df = genre_df.drop(['genre_y', 0, 1, 2, 3, 4, 5], axis=1)

In [None]:
genre_df.head(3)

In [None]:
remove_nones(genre_df['genre_2'])
remove_nones(genre_df['genre_3'])
remove_nones(genre_df['genre_4'])
remove_nones(genre_df['genre_5'])
remove_nones(genre_df['genre_6'])

In [None]:
genre_df['genre_1'] = genre_df['genre_1'].astype(int)
genre_df['genre_2'] = genre_df['genre_2'].astype(int)
genre_df['genre_3'] = genre_df['genre_3'].astype(int)
genre_df['genre_4'] = genre_df['genre_4'].astype(int)
genre_df['genre_5'] = genre_df['genre_5'].astype(int)
genre_df['genre_6'] = genre_df['genre_6'].astype(int)
genre_df['imdb_rating'] = genre_df['imdb_rating'].astype(float)

In [None]:
genre_df['genre_1_weighted'] = genre_df['imdb_rating'] * genre_df['genre_1']
genre_df['genre_2_weighted'] = genre_df['imdb_rating'] * genre_df['genre_2']
genre_df['genre_3_weighted'] = genre_df['imdb_rating'] * genre_df['genre_3']
genre_df['genre_4_weighted'] = genre_df['imdb_rating'] * genre_df['genre_4']
genre_df['genre_5_weighted'] = genre_df['imdb_rating'] * genre_df['genre_5']
genre_df['genre_6_weighted'] = genre_df['imdb_rating'] * genre_df['genre_6']

genre_df.head()

In [None]:
genre_df_weighted = genre_df.drop(['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6'], axis=1)

In [None]:
with open('./Assets_&_Data/model_prelim.pickle', 'rb') as handle:
    model_df = pickle.load(handle)

In [None]:
X = genre_df_weighted.drop('imdb_rating', axis=1)
y = genre_df_weighted['imdb_rating']

In [None]:
X = model_df[['Action', ' Adventure', ' Animation', ' Comedy', ' Crime',
       ' Drama', ' Family', ' Fantasy', ' Game-Show', ' History', ' Horror',
       ' Music', ' Musical', ' Mystery', ' News', ' Reality-TV', ' Romance',
       ' Sci-Fi', ' Short', ' Sport', ' Talk-Show', ' Thriller', ' War',
       ' Western', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
       'History', 'Horror', 'Music', 'Mystery', 'News', 'Reality-TV',
       'Romance', 'Sci-Fi', 'Sport', 'Talk-Show', 'Western']]
y = model_df['imdb_rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
lr = LinearRegression()
lr.fit(X_train_sc, y_train)
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

In [None]:
model_df.columns

# Testing out the weights

In [None]:
temp_df.columns

In [None]:
X = temp_df.drop(['imdb_rating', 'actor_1', 'actor_2', 'actor_3', 'actor_4'], axis=1)
y = temp_df['imdb_rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
lr = LinearRegression()

In [None]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
lr.fit(X_train_sc, y_train)

lr.score(X_train_sc, y_train)

In [None]:
lr.score(X_test_sc, y_test)

In [None]:
nlp_df['raked_words'] = ""

for index, row in nlp_df.iterrows():
    plot = row['bag_of_words']
    
    r = Rake()
    r.extract_keywords_from_text(plot)

    key_words_dict_scores = r.get_word_degrees()
    row['raked_words'] = list(key_words_dict_scores)

# dropping the Plot column
#df.drop(columns = ['Plot'], inplace = True)

In [None]:
r = Rake()
r.extract_keywords_from_text(nlp_df['bag_of_words'][10])

In [None]:
test = r.get_ranked_phrases_with_scores()
test

In [None]:
good_model = model_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
#good_model = good_model.merge(network_dummies, left_index=True, right_index=True)

In [None]:
good_model.columns

In [None]:
X.columns

In [None]:
X = good_model[['imdb_votes_x', 'awards_x', 'number_of_seasons_x', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', ' Action', ' Adventure', ' Animation', ' Comedy', ' Crime',
       ' Drama', ' Family', ' Fantasy', ' Game-Show', ' History', ' Horror',
       ' Music', ' Musical', ' Mystery', ' News', ' Reality-TV', ' Romance',
       ' Sci-Fi', ' Short', ' Sport', ' Talk-Show', ' Thriller', ' War',
       ' Western', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
       'History', 'Horror', 'Music', 'Mystery', 'News', 'Reality-TV',
       'Romance', 'Sci-Fi', 'Sport', 'Talk-Show', 'Western', 'actor_1_weighted', 'actor_2_weighted', 'actor_3_weighted',
       'actor_4_weighted']]
y = good_model['imdb_rating_x']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.1)
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
lr = LinearRegression()
lr.fit(X_train_sc, y_train)

In [None]:
lr.score(X_train_sc, y_train), lr.score(X_test_sc, y_test)

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train_sc, y_train)
rf.score(X_train_sc, y_train), rf.score(X_test_sc, y_test)

In [None]:
gs = GradientBoostingRegressor()
gs.fit(X_train_sc, y_train)
gs.score(X_train_sc, y_train), gs.score(X_test_sc, y_test)

# Next is genre bag-of-words for topic modeling

# Writer bag-of-words?

# Setting up Grid Search(es)

1. Instantiate the models being used in the pipeline along with the static parameters
2. Create parameter dicts for each of the models' hyperparameters
3. Instantiate GridSearchCV with the above pipeline and params

In [None]:
lr_gs = GridSearchCV(LinearRegression(), 
                         {'n_jobs': [1, 5, 10]}, 
                         n_jobs=10,
                         scoring='neg_mean_absolute_error')

In [None]:
lr_gs.fit(X_train_sc, y_train)

In [None]:
lr_gs.score(X_train_sc, y_train), lr_gs.score(X_test_sc, y_test)

In [None]:
lasso_gs = GridSearchCV(LassoCV(),
                        {'n_alphas': [100, 250],
                        'tol': [.0001, .001, .01]},
                        scoring='neg_mean_absolute_error'
)

In [None]:
lasso_gs.fit(X_train_sc, y_train)
lasso_gs.score(X_train_sc, y_train), lasso_gs.score(X_test_sc, y_test)

In [None]:
ridge_gs = GridSearchCV(RidgeCV(),
                        {'alphas': [(0.1, 1.0, 10.0), (0.01, 1.0, 10.0)]},
                        scoring='neg_mean_absolute_error'
)

In [None]:
ridge_gs.fit(X_train_sc, y_train)
ridge_gs.score(X_train_sc, y_train), ridge_gs.score(X_test_sc, y_test)

In [None]:
en_gs = GridSearchCV(ElasticNetCV(),
                        {'l1_ratio': [0.25, 0.5, 0.75],
                         'n_alphas': [100, 250],
                         'tol': [0.0001, 0.001, 0.01]},
                     scoring='neg_mean_absolute_error'
)

In [None]:
en_gs.fit(X_train_sc, y_train)
en_gs.score(X_train_sc, y_train), en_gs.score(X_test_sc, y_test)

In [None]:
rf_gs = GridSearchCV(RandomForestRegressor(), 
                         {'n_estimators': [10, 100, 200],
                         'max_depth': [5, 25, 50],
                         'min_samples_split': [2, 5, 10],
                         'n_jobs': [1, 5, 10]},
                     scoring='neg_mean_absolute_error'
)

In [None]:
rf_gs.fit(X_train_sc, y_train)
rf_gs.score(X_train_sc, y_train), rf_gs.score(X_test_sc, y_test)

In [None]:
gb_gs = GridSearchCV(GradientBoostingRegressor(),
                        {'learning_rate': [0.01, 0.1, 0.15],
                        'n_estimators': [100, 250],
                        'min_samples_split': [2, 4, 6],
                        'max_depth': [3, 5, 10],
                        'max_features': [0.5, 0.75, None],
                        'alpha': [0.9, 0.95]},
                     scoring='neg_mean_absolute_error'
)

In [None]:
gb_gs.fit(X_train_sc, y_train)
gb_gs.score(X_train_sc, y_train), gb_gs.score(X_test_sc, y_test)

In [None]:
gs.fit(X_train_sc, y_train)
gs.score(X_train_sc, y_train), gs.score(X_test_sc, y_test)

gb_gs.fit(X_train_sc, y_train)
gb_gs.score(X_train_sc, y_train), gb_gs.score(X_test_sc, y_test)

In [None]:
plt.hist(good_model['imdb_rating_x'])

In [None]:
plt.figure(figsize=(10,10))
plt.hist(good_model['imdb_votes_y'])

In [None]:
good_model.imdb_votes_x.sort_values(ascending=False)

In [None]:
plt.scatter(good_model['imdb_rating_x'], good_model['imdb_votes_x'])

In [None]:
series_df.columns

In [None]:
plt.scatter(series_df['imdb_rating'], series_df['number_of_seasons'])

In [None]:
plt.scatter(series_df['imdb_rating'], series_df['number_of_episodes'])

In [None]:
final_shows = list(model_df.index)

In [None]:
with open('./Assets_&_Data/final_show_list.pickle', 'wb') as f:
    pickle.dump(final_shows, f)