In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [58]:
# Importing Data
%store -r imdb
%store -r float_imdb

In [3]:
# 81,273 by 70 
imdb.shape

(81273, 70)

In [350]:
# imdb.avg_vote is our target variable 

In [3]:
# Removing columns that are too cumbersome to clean
# income and budget columns were all in different denominations 
obj_columns = imdb.select_dtypes(['object'])
obj_columns = obj_columns.drop(['worlwide_gross_income', 'budget', 'usa_gross_income', 
                               'description', 'original_title', 'title', 'date_published'], axis=1)
obj_columns.head(2)

Unnamed: 0,imdb_title_id,genre,country,language,director,writer,production_company,actors
0,tt0000574,"Biography, Crime, Drama",Australia,0,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be..."
1,tt0001892,Drama,"Germany, Denmark",0,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse..."


In [4]:
# Cleaning columns to be used to find top 50 or 10%
imdb.actors = imdb.actors.str.strip()
imdb.director = imdb.director.str.strip()
imdb.genre = imdb.genre.str.strip()
imdb.country = imdb.country.str.strip()
imdb.language = imdb.language.str.strip()
imdb.writer = imdb.writer.str.strip()
imdb.production_company = imdb.production_company.str.strip()

imdb.actors = imdb.actors.replace(np.nan, 0)
imdb.director = imdb.director.replace(np.nan, 0)
imdb.genre = imdb.genre.replace(np.nan, 0)
imdb.country = imdb.country.replace(np.nan, 0)
imdb.language = imdb.language.replace(np.nan, 0)
imdb.writer = imdb.writer.replace(np.nan, 0)
imdb.production_company = imdb.production_company.replace(np.nan, 0)

In [5]:
# Actors
cleaned_actors = pd.DataFrame(obj_columns.set_index('imdb_title_id').actors.str.split(',', expand=True).stack())
cleaned_actors = cleaned_actors.reset_index()
cleaned_actors.columns = ['imdb_title_id','order','actors']
cleaned_actors = cleaned_actors.drop(['order'], axis=1)
cleaned_actors = cleaned_actors.actors.str.strip()
cleaned_actors.head()

0     Elizabeth Tait
1          John Tait
2    Norman Campbell
3         Bella Cola
4         Will Coyne
Name: actors, dtype: object

In [6]:
cleaned_actors_count = pd.Series(cleaned_actors.value_counts())
cleaned_actors_count.head()

Brahmanandam    303
Anupam Kher     211
Nassar          196
Eric Roberts    187
Prakash Raj     180
Name: actors, dtype: int64

In [7]:
# Actors column is too large to take top 10% so we go with top 60
threshold = cleaned_actors_count >= 100
threshold.value_counts()

False    396093
True         60
Name: actors, dtype: int64

In [8]:
top_actors = pd.DataFrame(cleaned_actors_count.head(60))
top_actors = top_actors.reset_index()
top_actors.columns = ['actors','num_app']
top_actors = top_actors.drop(['num_app'], axis=1)
top_actors = top_actors['actors'].str.strip().tolist()
top_actors[:5]

['Brahmanandam', 'Anupam Kher', 'Nassar', 'Eric Roberts', 'Prakash Raj']

In [9]:
# Define function for finding top actors

def in_top_actors(actors):
    if type(actors) == int:
        actors = []
    else: 
        actors = actors.split(',')
    return [1 if i in actors else 0 for i in top_actors]

In [10]:
# Applying function to create the top actors matrix
obj_columns['top_actors'] = obj_columns['actors'].map(in_top_actors)

In [11]:
# Converting the matrix to a Dataframe array
top_actors_array = pd.DataFrame(np.array(obj_columns['top_actors'].tolist()), columns=top_actors)
top_actors_array.head()

Unnamed: 0,Brahmanandam,Anupam Kher,Nassar,Eric Roberts,Prakash Raj,Amitabh Bachchan,Gérard Depardieu,Mohanlal,Tanikella Bharani,John Carradine,...,Jackie Shroff,Malcolm McDowell,Jackie Chan,Reginald Owen,Joy Badlani,Danny Glover,Ajay Devgn,Suet Lam,John Hurt,Sanjay Dutt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Directors
cleaned_director = pd.DataFrame(obj_columns.set_index('imdb_title_id').director.str.split(',', expand=True).stack())
cleaned_director = cleaned_director.reset_index()
cleaned_director.columns = ['imdb_title_id','order','director']
cleaned_director = cleaned_director.drop(['order'], axis=1)
cleaned_director = cleaned_director.director.str.strip()
cleaned_director.head()

0           Charles Tait
1              Urban Gad
2     Charles L. Gaskill
3    Francesco Bertolini
4         Adolfo Padovan
Name: director, dtype: object

In [13]:
cleaned_director_count = pd.Series(cleaned_director.value_counts())
cleaned_director_count.head()

Michael Curtiz    99
Jesús Franco      92
Lloyd Bacon       78
John Ford         76
Richard Thorpe    74
Name: director, dtype: int64

In [14]:
# Directors column is too large to take top 10% so we go with top 1%
threshold_d = cleaned_director_count >= 50
threshold_d.value_counts()

False    32503
True        33
Name: director, dtype: int64

In [15]:
top_directors = pd.DataFrame(cleaned_director_count.head(33))
top_directors = top_directors.reset_index()
top_directors.columns = ['director','num_app']
top_directors = top_directors.drop(['num_app'], axis=1)
top_directors = top_directors['director'].str.strip().tolist()
top_directors[:5]

['Michael Curtiz',
 'Jesús Franco',
 'Lloyd Bacon',
 'John Ford',
 'Richard Thorpe']

In [16]:
# Define function for finding top directors

def in_top_director(director):
    if type(director) == int:
        director = []
    else: 
        director = director.split(',')
    return [1 if i in director else 0 for i in top_directors]

In [17]:
# Applying function to create the top directors matrix
obj_columns['top_directors'] = obj_columns['director'].map(in_top_director)

In [18]:
# Converting the matrix to a Dataframe array
top_directors_array = pd.DataFrame(np.array(obj_columns['top_directors'].tolist()), columns=top_directors)
top_directors_array.head()

Unnamed: 0,Michael Curtiz,Jesús Franco,Lloyd Bacon,John Ford,Richard Thorpe,William Beaudine,Jing Wong,Gordon Douglas,Cheh Chang,Raoul Walsh,...,Fred Olen Ray,Ray Enright,Roy Del Ruth,William A. Seiter,Robert Z. Leonard,Carlo Vanzina,Joseph Kane,William Dieterle,Roger Corman,Johnnie To
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Renaming directors that appeared agian as another column header

top_directors_array = top_directors_array.rename(columns={'Cheh Chang':'Cheh Chang_d', 
                                                          'Giannis Dalianidis':'Giannis Dalianidis_d',
                                                          'Claude Chabrol':'Claude Chabrol_d',
                                                          'Carlo Vanzina':'Carlo Vanzina_d',
                                                          'Jing Wong':'Jing Wong_d',
                                                          'Jesús Franco':'Jesús Franco_d'})

In [20]:
# Genre
cleaned_genre = pd.DataFrame(obj_columns.set_index('imdb_title_id').genre.str.split(',', expand=True).stack())
cleaned_genre = cleaned_genre.reset_index()
cleaned_genre.columns = ['imdb_title_id','order','genre']
cleaned_genre = cleaned_genre.drop(['order'], axis=1)
cleaned_genre = cleaned_genre.genre.str.strip()
cleaned_genre.head()

0    Biography
1        Crime
2        Drama
3        Drama
4        Drama
Name: genre, dtype: object

In [21]:
cleaned_genre_count = pd.Series(cleaned_genre.value_counts())
cleaned_genre_count.head()

Drama       44404
Comedy      27873
Romance     13379
Action      12106
Thriller    10814
Name: genre, dtype: int64

In [22]:
threshold_g = cleaned_genre_count >= 50
threshold_g.value_counts()

True     21
False     5
Name: genre, dtype: int64

In [23]:
top_genres = pd.DataFrame(cleaned_genre_count.head(21))
top_genres = top_genres.reset_index()
top_genres.columns = ['genre','num_app']
top_genres = top_genres.drop(['num_app'], axis=1)
top_genres = top_genres['genre'].str.strip().tolist()
top_genres[:5]

['Drama', 'Comedy', 'Romance', 'Action', 'Thriller']

In [24]:
# Define function for finding top genres

def in_top_genre(genre):
    if type(genre) == int:
        genre = []
    else: 
        genre = genre.split(',')
    return [1 if i in genre else 0 for i in top_genres]

In [25]:
obj_columns['top_genres'] = obj_columns['genre'].map(in_top_genre)

In [26]:
top_genres_array = pd.DataFrame(np.array(obj_columns['top_genres'].tolist()), columns=top_genres)
top_genres_array.head()

Unnamed: 0,Drama,Comedy,Romance,Action,Thriller,Crime,Horror,Adventure,Mystery,Family,...,Sci-Fi,Biography,History,War,Animation,Musical,Western,Music,Sport,Film-Noir
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [27]:
# Country
cleaned_country = pd.DataFrame(obj_columns.set_index('imdb_title_id').country.str.split(',', expand=True).stack())
cleaned_country = cleaned_country.reset_index()
cleaned_country.columns = ['imdb_title_id','order','country']
cleaned_country = cleaned_country.drop(['order'], axis=1)
cleaned_country = cleaned_country.country.str.strip()
cleaned_country.head()

0    Australia
1      Germany
2      Denmark
3          USA
4        Italy
Name: country, dtype: object

In [28]:
cleaned_country_count = pd.Series(cleaned_country.value_counts())
cleaned_country_count.mean()

541.3157894736842

In [29]:
threshold_c = cleaned_country_count >= 980
threshold_c.value_counts()

False    171
True      19
Name: country, dtype: int64

In [30]:
top_countries = pd.DataFrame(cleaned_country_count.head(19))
top_countries = top_countries.reset_index()
top_countries.columns = ['country','num_app']
top_countries = top_countries.drop(['num_app'], axis=1)
top_countries = top_countries['country'].str.strip().tolist()
top_countries[:5]

['USA', 'France', 'UK', 'India', 'Italy']

In [31]:
# Define function for finding country in top

def in_top_countries(country):
    if type(country) == int:
        country = []
    else: 
        country = country.split(',')
    return [1 if i in country else 0 for i in top_countries]

In [32]:
obj_columns['top_countries'] = obj_columns['country'].map(in_top_countries)

In [33]:
top_countries_array = pd.DataFrame(np.array(obj_columns['top_countries'].tolist()), columns=top_countries)
top_countries_array.head()

Unnamed: 0,USA,France,UK,India,Italy,Germany,Japan,Canada,Spain,Hong Kong,Turkey,Belgium,South Korea,Sweden,Australia,West Germany,Mexico,China,Denmark
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
# Languages
cleaned_language = pd.DataFrame(obj_columns.set_index('imdb_title_id').language.str.split(',', expand=True).stack())
cleaned_language = cleaned_language.reset_index()
cleaned_language.columns = ['imdb_title_id','order','language']
cleaned_language = cleaned_language.drop(['order'], axis=1)
cleaned_language = cleaned_language.language.str.strip()
cleaned_language.head()

0    English
1    Italian
2    English
3     German
4    Italian
Name: language, dtype: object

In [35]:
cleaned_language_count = pd.Series(cleaned_language.value_counts())
cleaned_language_count.mean()

392.90909090909093

In [36]:
threshold_l = cleaned_language_count >= 500
threshold_l.value_counts()

False    238
True      26
Name: language, dtype: int64

In [37]:
top_languages = pd.DataFrame(cleaned_language_count.head(26))
top_languages = top_languages.reset_index()
top_languages.columns = ['language','num_app']
top_languages = top_languages.drop(['num_app'], axis=1)
top_languages = top_languages['language'].str.strip().tolist()
top_languages[:5]

['English', 'French', 'Spanish', 'Italian', 'German']

In [38]:
# Define function for finding language in top

def in_top_languages(language):
    if type(language) == int:
        language = []
    else: 
        language = language.split(',')
    return [1 if i in language else 0 for i in top_languages]

In [39]:
obj_columns['top_languages'] = obj_columns['language'].map(in_top_languages)

In [40]:
top_languages_array = pd.DataFrame(np.array(obj_columns['top_languages'].tolist()), columns=top_languages)
top_languages_array.head()

Unnamed: 0,English,French,Spanish,Italian,German,Japanese,Hindi,Russian,Mandarin,Cantonese,...,Telugu,Polish,Dutch,Malayalam,Danish,Greek,Persian,Finnish,Czech,Hungarian
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Writers
cleaned_writer = pd.DataFrame(obj_columns.set_index('imdb_title_id').writer.str.split(',', expand=True).stack())
cleaned_writer = cleaned_writer.reset_index()
cleaned_writer.columns = ['imdb_title_id','order','writer']
cleaned_writer = cleaned_writer.drop(['order'], axis=1)
cleaned_writer = cleaned_writer.writer.str.strip()
cleaned_writer.head()

0                  Charles Tait
1                     Urban Gad
2    Gebhard Schätzler-Perasini
3              Victorien Sardou
4               Dante Alighieri
Name: writer, dtype: object

In [42]:
cleaned_writer_count = pd.Series(cleaned_writer.value_counts())
cleaned_writer_count.head()

Jing Wong              107
Kuang Ni                97
Jesús Franco            89
William Shakespeare     85
Robin Bhatt             71
Name: writer, dtype: int64

In [43]:
threshold_w = cleaned_writer_count > 35
threshold_w.value_counts()

False    58324
True        62
Name: writer, dtype: int64

In [44]:
top_writers = pd.DataFrame(cleaned_writer_count.head(62))
top_writers = top_writers.reset_index()
top_writers.columns = ['writer','num_app']
top_writers = top_writers.drop(['num_app'], axis=1)
top_writers = top_writers['writer'].str.strip().tolist()
top_writers[:5]

['Jing Wong', 'Kuang Ni', 'Jesús Franco', 'William Shakespeare', 'Robin Bhatt']

In [45]:
# Define function for finding writers in top

def in_top_writers(writer):
    if type(writer) == int:
        writer = []
    else: 
        writer = writer.split(',')
    return [1 if i in writer else 0 for i in top_writers]

In [46]:
obj_columns['top_writers'] = obj_columns['writer'].map(in_top_writers)

In [47]:
top_writers_array = pd.DataFrame(np.array(obj_columns['top_writers'].tolist()), columns=top_writers)
top_writers_array.head()

Unnamed: 0,Jing Wong,Kuang Ni,Jesús Franco,William Shakespeare,Robin Bhatt,Javed Akhtar,Leonardo Benvenuti,Michel Audiard,Sachin Bhowmick,Jean-Claude Carrière,...,John Twist,Pascal Bonitzer,Harry Alan Towers,Dalton Trumbo,Hing-Ka Chan,Dardano Sacchetti,Ingmar Bergman,John Lee Mahin,Blake Edwards,Fujio F. Fujiko
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Production Company
cleaned_company = pd.DataFrame(obj_columns.set_index('imdb_title_id').production_company.str.split(',', expand=True).stack())
cleaned_company = cleaned_company.reset_index()
cleaned_company.columns = ['imdb_title_id','order','company']
cleaned_company = cleaned_company.drop(['order'], axis=1)
cleaned_company = cleaned_company.company.str.strip()
cleaned_company.head()

0                   J. and N. Tait
1                         Fotorama
2    Helen Gardner Picture Players
3                      Milano Film
4                    Kalem Company
Name: company, dtype: object

In [49]:
cleaned_company_count = pd.Series(cleaned_company.value_counts())
cleaned_company_count.head()

Metro-Goldwyn-Mayer (MGM)    1290
Warner Bros.                 1148
Columbia Pictures             898
Paramount Pictures            884
Twentieth Century Fox         866
Name: company, dtype: int64

In [50]:
threshold_pc = cleaned_company_count >= 70
threshold_pc.value_counts()

False    30137
True        51
Name: company, dtype: int64

In [51]:
top_company = pd.DataFrame(cleaned_company_count.head(51))
top_company = top_company.reset_index()
top_company.columns = ['company','num_app']
top_company = top_company.drop(['num_app'], axis=1)
top_company = top_company['company'].str.strip().tolist()
top_company[:5]

['Metro-Goldwyn-Mayer (MGM)',
 'Warner Bros.',
 'Columbia Pictures',
 'Paramount Pictures',
 'Twentieth Century Fox']

In [52]:
# Define function for finding companies in top

def in_top_companies(company):
    if type(company) == int:
        company = []
    else: 
        company = company.split(',')
    return [1 if i in company else 0 for i in top_company]

In [53]:
obj_columns['top_company'] = obj_columns['production_company'].map(in_top_companies)

In [54]:
top_companies_array = pd.DataFrame(np.array(obj_columns['top_company'].tolist()), columns=top_company)
top_companies_array.head()

Unnamed: 0,Metro-Goldwyn-Mayer (MGM),Warner Bros.,Columbia Pictures,Paramount Pictures,Twentieth Century Fox,Universal Pictures,RKO Radio Pictures,Universal International Pictures (UI),Mosfilm,Canal+,...,Avala Film,Nordisk Film,Hammer Films,BBC Films,Lionsgate,Kinostudiya imeni M. Gorkogo,Orion Pictures,Bandai Visual Company,EuropaCorp,Dimension Films
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Bringing in our cleaned dataframe of all the non-object columns from our Rate Prediction System-iMDb-1 notebook
len(float_imdb.columns)

57

In [56]:
obj_imdb = pd.concat([top_actors_array, top_directors_array, top_genres_array, top_countries_array, 
                       top_languages_array, top_writers_array, top_companies_array], axis=1)
obj_imdb.head(2)

Unnamed: 0,Brahmanandam,Anupam Kher,Nassar,Eric Roberts,Prakash Raj,Amitabh Bachchan,Gérard Depardieu,Mohanlal,Tanikella Bharani,John Carradine,...,Avala Film,Nordisk Film,Hammer Films,BBC Films,Lionsgate,Kinostudiya imeni M. Gorkogo,Orion Pictures,Bandai Visual Company,EuropaCorp,Dimension Films
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
%store obj_imdb

Stored 'obj_imdb' (DataFrame)


In [60]:
# Concatenating all of our dataframes to be put into our model
test_imdb = pd.concat([float_imdb, top_actors_array, top_directors_array, top_genres_array, top_countries_array, 
                       top_languages_array, top_writers_array, top_companies_array], axis=1)
test_imdb.head(2)

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,weighted_average_vote,total_votes,mean_vote,...,Avala Film,Nordisk Film,Hammer Films,BBC Films,Lionsgate,Kinostudiya imeni M. Gorkogo,Orion Pictures,Bandai Visual Company,EuropaCorp,Dimension Films
0,1906.0,70.0,6.1,537.0,0.0,7.0,7.0,6.1,537.0,6.3,...,0,0,0,0,0,0,0,0,0,0
1,1911.0,53.0,5.9,171.0,0.0,4.0,2.0,5.9,171.0,6.1,...,0,0,0,0,0,0,0,0,0,0


In [217]:
X_1 = test_imdb.drop(['avg_vote', 'imdb_title_id', 'title'], axis=1)
X_1.head(2)

Unnamed: 0,year,duration,votes,metascore,reviews_from_users,reviews_from_critics,weighted_average_vote,total_votes,mean_vote,median_vote,...,Avala Film,Nordisk Film,Hammer Films,Lionsgate,BBC Films,Orion Pictures,Kinostudiya imeni M. Gorkogo,Bandai Visual Company,EuropaCorp,Dimension Films
0,1906.0,70.0,537.0,0.0,7.0,7.0,6.1,537.0,6.3,6.0,...,0,0,0,0,0,0,0,0,0,0
1,1911.0,53.0,171.0,0.0,4.0,2.0,5.9,171.0,6.1,6.0,...,0,0,0,0,0,0,0,0,0,0


In [218]:
Y_1 = test_imdb['avg_vote']
Y_1.head(2)

0    6.1
1    5.9
Name: avg_vote, dtype: float64

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X_1, Y_1, test_size=0.33, random_state=42)

In [220]:
model_0 = XGBClassifier()

In [221]:
model_0.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [222]:
# Make predictions on model
y_pred = model_0.predict(X_test)

In [223]:
# Compute RMSE 
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse.round(5)))

Root Mean Squared Error: 0.00334


In [224]:
# Predicting Movies with XGB
top20_movies = X_1.sort_values(by = ['weighted_average_vote'], ascending=False, axis=0).head(20)
top20_movies.head()

Unnamed: 0,year,duration,votes,metascore,reviews_from_users,reviews_from_critics,weighted_average_vote,total_votes,mean_vote,median_vote,...,Avala Film,Nordisk Film,Hammer Films,Lionsgate,BBC Films,Orion Pictures,Kinostudiya imeni M. Gorkogo,Bandai Visual Company,EuropaCorp,Dimension Films
52002,2019.0,132.0,587.0,0.0,11.0,0.0,10.0,587.0,9.6,10.0,...,0,0,0,0,0,0,0,0,0,0
77908,2019.0,100.0,2360.0,0.0,6.0,10.0,10.0,2360.0,10.0,10.0,...,0,0,0,0,0,0,0,0,0,0
81138,2019.0,138.0,425.0,0.0,2.0,0.0,9.8,425.0,9.5,10.0,...,0,0,0,0,0,0,0,0,0,0
51300,2019.0,131.0,133.0,0.0,0.0,0.0,9.7,133.0,9.3,10.0,...,0,0,0,0,0,0,0,0,0,0
81133,2019.0,140.0,1176.0,0.0,24.0,1.0,9.6,1176.0,9.7,10.0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
# Generating predictions
y_top20_movies = model_0.predict(top20_movies)
y_top20_movies

array([ 9.5, 10. ,  9.8,  9.5,  9.6,  9.6,  9.5,  9.5,  9.5,  9.4,  9.4,
        9.4,  9.4,  9.4,  9.4,  9.4,  9.2,  9.3,  9.3,  9.2])

In [226]:
# Creating a dataframe out of our predictions
y_top20_movies = pd.DataFrame(y_top20_movies)
y_top20_movies.columns = ['Predicted Avg Score']
y_top20_movies.head(2)

Unnamed: 0,Predicted Avg Score
0,9.5
1,10.0


In [227]:
# Creating a dataframe of our titles that our predictions were based on
movie_names = new_imdb.sort_values(by = ['weighted_average_vote'], ascending=False, axis=0).head(20)
movie_names = pd.DataFrame(movie_names['title'])
movie_names = movie_names.reset_index()
movie_names = movie_names.drop('index', axis=1)
movie_names.head()

Unnamed: 0,title
0,Kirket
1,Love in Kilnerry
2,Gini Helida Kathe
3,Runam
4,Android Kunjappan Version 5.25


In [228]:
# Creating a dataframe of the actual average votes to compare to our predictions
actual_avg_vote = new_imdb.sort_values(by = ['weighted_average_vote'], ascending=False, axis=0).head(20)
actual_avg_vote = pd.DataFrame(actual_avg_vote['avg_vote'])
actual_avg_vote = actual_avg_vote.reset_index()
actual_avg_vote = actual_avg_vote.drop('index', axis=1)
actual_avg_vote.head()

Unnamed: 0,avg_vote
0,10.0
1,10.0
2,9.8
3,9.7
4,9.6


In [229]:
# Bringing all of our dataframes together to visualize how our predictions compare
movie_names_pred = pd.concat([movie_names,actual_avg_vote,y_top20_movies], axis=1)
movie_names_pred

Unnamed: 0,title,avg_vote,Predicted Avg Score
0,Kirket,10.0,9.5
1,Love in Kilnerry,10.0,10.0
2,Gini Helida Kathe,9.8,9.8
3,Runam,9.7,9.5
4,Android Kunjappan Version 5.25,9.6,9.6
5,Fan,9.6,9.6
6,The Brighton Miracle,9.5,9.5
7,Yeh Suhaagraat Impossible,9.5,9.5
8,Safe,9.5,9.5
9,Ananthu V/S Nusrath,9.4,9.4
