In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Importing Data
%store -r filmtv_movies

In [3]:
# What is our Target Variable? 
# avg_vote
# FilmTv length 53,497
filmtv_movies.head()

Unnamed: 0,filmtv_id,title,year,genre,duration,country,director,actors,avg_vote,votes,description,notes
0,2,Bugs Bunny's Third Movie: 1001 Rabbit Tales,1982.0,Animation,76,United States,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic..."
1,3,18 anni tra una settimana,1991.0,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas..."
2,17,Ride a Wild Pony,1976.0,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.5,10,In the Australia of the pioneers a boy and a g...,"""Ecological"" fable with a happy ending not wit..."
3,18,Diner,1982.0,Comedy,95,United States,Barry Levinson,"Mickey Rourke, Steve Guttenberg, Ellen Barkin,...",7.2,15,Five boys from Baltimore are in the habit of m...,A cast of will be famous for Levinson's direct...
4,20,A che servono questi quattrini?,1942.0,Comedy,85,Italy,Esodo Pratelli,"Eduardo De Filippo, Peppino De Filippo, Clelia...",5.8,12,"With a trick, the Marquis Parascandolo pennile...",Taken from the theatrical piece by Armando Cur...


In [4]:
filmtv_movies.actors = filmtv_movies.actors.str.strip()
filmtv_movies.director = filmtv_movies.director.str.strip()
filmtv_movies.actors = filmtv_movies.actors.replace(np.nan, 0)
filmtv_movies.director = filmtv_movies.director.replace(np.nan, 0)

In [5]:
filmtv_movies.head(3)

Unnamed: 0,filmtv_id,title,year,genre,duration,country,director,actors,avg_vote,votes,description,notes
0,2,Bugs Bunny's Third Movie: 1001 Rabbit Tales,1982.0,Animation,76,United States,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic..."
1,3,18 anni tra una settimana,1991.0,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas..."
2,17,Ride a Wild Pony,1976.0,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.5,10,In the Australia of the pioneers a boy and a g...,"""Ecological"" fable with a happy ending not wit..."


In [6]:
# Cleaning actor column to encode
cleaned_actors = pd.DataFrame(filmtv_movies.set_index('filmtv_id').actors.str.split(',', expand=True).stack())
cleaned_actors = cleaned_actors.reset_index()
cleaned_actors.columns = ['filmtv_id','order','actors']
cleaned_actors = cleaned_actors.drop(['order'], axis=1)
cleaned_actors = cleaned_actors.actors.str.strip()
cleaned_actors.head()

0       Kim Rossi Stuart
1       Simona Cavallari
2    Ennio Fantastichini
3    Orso Maria Guerrini
4            Silli Togni
Name: actors, dtype: object

In [8]:
top_actors = pd.DataFrame(cleaned_actors_count.head(50))
top_actors = top_actors.reset_index()
top_actors.columns = ['actors','num_app']
top_actors = top_actors.drop(['num_app'], axis=1)
top_actors = top_actors['actors'].str.strip().tolist()
top_actors[:5]

['Alberto Sordi',
 'Gérard Depardieu',
 'Marcello Mastroianni',
 'Ugo Tognazzi',
 'Franco Nero']

In [9]:
# Define function for finding actors in top 50

def in_top_actors(actors):
    if type(actors) == int:
        actors = []
    else: 
        actors = actors.split(',')
    return [1 if i in actors else 0 for i in top_actors]

In [10]:
filmtv_movies['top_actors'] = filmtv_movies['actors'].map(in_top_actors)

In [11]:
top_actors_array = pd.DataFrame(np.array(filmtv_movies['top_actors'].tolist()), columns=top_actors)
top_actors_array.head()

Unnamed: 0,Alberto Sordi,Gérard Depardieu,Marcello Mastroianni,Ugo Tognazzi,Franco Nero,Christopher Lee,Antonella Lualdi,Donald Sutherland,Totò,Ciccio Ingrassia,...,Isabelle Huppert,James Mason,Christopher Plummer,Nicolas Cage,Ernest Borgnine,Claudia Cardinale,Enrico Maria Salerno,Mario Carotenuto,Malcolm McDowell,Anthony Hopkins
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Cleaning top 50 directors
cleaned_directors = pd.DataFrame(filmtv_movies.set_index('filmtv_id').director.str.split(',', expand=True).stack())
cleaned_directors = cleaned_directors.reset_index()
cleaned_directors.columns = ['filmtv_id','order','director']
cleaned_directors = cleaned_directors.drop(['order'], axis=1)
cleaned_directors = cleaned_directors.director.str.strip()
cleaned_directors.head()

0    David Detiege
1        Art Davis
2       Bill Perez
3    Luigi Perelli
4      Don Chaffey
Name: director, dtype: object

In [15]:
top_directors = pd.DataFrame(cleaned_directors_count.head(50))
top_directors = top_directors.reset_index()
top_directors.columns = ['directors','num_app']
top_directors = top_directors.drop(['num_app'], axis=1)
top_directors = top_directors['directors'].str.strip().tolist()
top_directors[:5]

['Steno', 'Mario Mattòli', 'John Ford', 'Carlo Vanzina', 'Umberto Lenzi']

In [16]:
# Define function for finding directors in top 50

def in_top_directors(directors):
    if type(directors) == int:
        directors = []
    else: 
        directors = directors.split(',')
    return [1 if i in directors else 0 for i in top_directors]

In [17]:
filmtv_movies['top_directors'] = filmtv_movies['director'].map(in_top_directors)

In [18]:
top_directors_array = pd.DataFrame(np.array(filmtv_movies['top_directors'].tolist()), columns=top_directors)
top_directors_array.head()

Unnamed: 0,Steno,Mario Mattòli,John Ford,Carlo Vanzina,Umberto Lenzi,Dino Risi,Takashi Miike,Michael Curtiz,Mario Monicelli,Jean-Luc Godard,...,Sidney Lumet,Dieter Kehler,George Sherman,Norman Taurog,Edgar Reitz,Mauro Bolognini,Rainer Werner Fassbinder,Kevin Connor,George Marshall,Alberto Sironi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Cleaning top 50 country
cleaned_country = pd.DataFrame(filmtv_movies.set_index('filmtv_id').country.str.split(',', expand=True).stack())
cleaned_country = cleaned_country.reset_index()
cleaned_country.columns = ['filmtv_id','order','country']
cleaned_country = cleaned_country.drop(['order'], axis=1)
cleaned_country = cleaned_country.country.str.strip()
cleaned_country.head()

0    United States
1            Italy
2    United States
3    United States
4            Italy
Name: country, dtype: object

In [47]:
cleaned_country_count = pd.Series(cleaned_country.value_counts())
cleaned_country_count.head(50).sum()

62733

In [48]:
top_country = pd.DataFrame(cleaned_country_count.head(50))
top_country = top_country.reset_index()
top_country.columns = ['country','num_app']
top_country = top_country.drop(['num_app'], axis=1)
top_country = top_country['country'].str.strip().tolist()
top_country[:5]

['United States', 'Italy', 'France', 'Great Britain', 'Germany']

In [49]:
# Define function for finding country in top 50

def in_top_country(country):
    if type(country) == int:
        country = []
    else: 
        country = country.split(',')
    return [1 if i in country else 0 for i in top_country]

In [50]:
filmtv_movies['top_country'] = filmtv_movies['country'].map(in_top_country)

In [51]:
top_country_array = pd.DataFrame(np.array(filmtv_movies['top_country'].tolist()), columns=top_country)
top_country_array.head()

Unnamed: 0,United States,Italy,France,Great Britain,Germany,Canada,Spain,Japan,Hong Kong,Australia,...,Greece,Taiwan,Chile,Thailand,Philippines,Yugoslavia,Czechoslovakia,Iceland,Bulgaria,Colombia
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Cleaning top genre
cleaned_genre = pd.DataFrame(filmtv_movies.set_index('filmtv_id').genre.str.split(',', expand=True).stack())
cleaned_genre = cleaned_genre.reset_index()
cleaned_genre.columns = ['filmtv_id','order','genre']
cleaned_genre = cleaned_genre.drop(['order'], axis=1)
cleaned_genre = cleaned_genre.genre.str.strip()
cleaned_genre.head()

0    Animation
1        Drama
2     Romantic
3       Comedy
4       Comedy
Name: genre, dtype: object

In [60]:
top_genre = pd.DataFrame(cleaned_genre_count.head(27))
top_genre = top_genre.reset_index()
top_genre.columns = ['genre','num_app']
top_genre = top_genre.drop(['num_app'], axis=1)
top_genre = top_genre['genre'].str.strip().tolist()
top_genre[:5]

['Drama', 'Comedy', 'Thriller', 'Documentary', 'Action']

In [61]:
# Define function for finding genre

def in_top_genre(genre):
    if type(genre) == int:
        genre = []
    else: 
        genre = genre.split(',')
    return [1 if i in genre else 0 for i in top_genre]

In [62]:
filmtv_movies['top_genre'] = filmtv_movies['genre'].map(in_top_genre)

In [63]:
top_genre_array = pd.DataFrame(np.array(filmtv_movies['top_genre'].tolist()), columns=top_genre)
top_genre_array.head()

Unnamed: 0,Drama,Comedy,Thriller,Documentary,Action,Horror,Adventure,Fantasy,Western,Animation,...,Noir,Sperimental,History,Mélo,Mythology,Super-hero,Gangster,Biblical,Short Movie,Sport
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
filmtv_movies.head(2)

Unnamed: 0,filmtv_id,title,year,genre,duration,country,director,actors,avg_vote,votes,description,notes,top_actors,top_directors,top_country,top_genre
0,2,Bugs Bunny's Third Movie: 1001 Rabbit Tales,1982.0,Animation,76,United States,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,3,18 anni tra una settimana,1991.0,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
filmtv_w_actors = pd.concat([top_actors_array, filmtv_movies], axis=1)
filmtv_w_actors.head(2)

Unnamed: 0,Alberto Sordi,Gérard Depardieu,Marcello Mastroianni,Ugo Tognazzi,Franco Nero,Christopher Lee,Antonella Lualdi,Donald Sutherland,Totò,Ciccio Ingrassia,...,director,actors,avg_vote,votes,description,notes,top_actors,top_directors,top_country,top_genre
0,0,0,0,0,0,0,0,0,0,0,...,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,0,0,0,0,0,0,0,0,0,0,...,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [66]:
filmtv_w_directors = pd.concat([top_directors_array, filmtv_w_actors], axis=1)
filmtv_w_directors.head(2)

Unnamed: 0,Steno,Mario Mattòli,John Ford,Carlo Vanzina,Umberto Lenzi,Dino Risi,Takashi Miike,Michael Curtiz,Mario Monicelli,Jean-Luc Godard,...,director,actors,avg_vote,votes,description,notes,top_actors,top_directors,top_country,top_genre
0,0,0,0,0,0,0,0,0,0,0,...,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,0,0,0,0,0,0,0,0,0,0,...,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [67]:
filmtv_w_country = pd.concat([top_country_array, filmtv_w_directors], axis=1)
filmtv_w_country.head(2)

Unnamed: 0,United States,Italy,France,Great Britain,Germany,Canada,Spain,Japan,Hong Kong,Australia,...,director,actors,avg_vote,votes,description,notes,top_actors,top_directors,top_country,top_genre
0,1,0,0,0,0,0,0,0,0,0,...,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,0,1,0,0,0,0,0,0,0,0,...,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [68]:
filmtv_complete = pd.concat([top_genre_array, filmtv_w_country], axis=1)
filmtv_complete.head(2)

Unnamed: 0,Drama,Comedy,Thriller,Documentary,Action,Horror,Adventure,Fantasy,Western,Animation,...,director,actors,avg_vote,votes,description,notes,top_actors,top_directors,top_country,top_genre
0,0,0,0,0,0,0,0,0,0,1,...,"David Detiege, Art Davis, Bill Perez",0,7.7,31,"With two protruding incisors, a little crafty ...","These are many small independent stories, whic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,1,0,0,0,0,0,0,0,0,0,...,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",7.0,3,"Samantha, not yet eighteen, abandons the comfo...","Luigi Perelli, the director of ""Piovra"", occas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [25]:
len(filmtv_complete.columns.tolist())

114

In [71]:
filmtv_complete_ohe = filmtv_complete.drop(['director', 'actors', 'country', 'genre', 'description', 'notes', 
                                            'top_actors', 'top_directors', 'top_country', 'top_genre'], axis=1)
len(filmtv_complete_ohe.columns.tolist())

183

In [75]:
filmtv_complete_ohe_0 = filmtv_complete_ohe.drop(['title'], axis=1)

In [76]:
filmtv_complete_ohe_1 = pd.get_dummies(data=filmtv_complete_ohe_0)
filmtv_complete_ohe_1.head(2)

Unnamed: 0,Drama,Comedy,Thriller,Documentary,Action,Horror,Adventure,Fantasy,Western,Animation,...,Claudia Cardinale,Enrico Maria Salerno,Mario Carotenuto,Malcolm McDowell,Anthony Hopkins,filmtv_id,year,duration,avg_vote,votes
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,2,1982.0,76,7.7,31
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,1991.0,98,7.0,3


In [78]:
X_1 = filmtv_complete_ohe_1.drop(['avg_vote'], axis=1)
X_1.head()

Unnamed: 0,Drama,Comedy,Thriller,Documentary,Action,Horror,Adventure,Fantasy,Western,Animation,...,Ernest Borgnine,Claudia Cardinale,Enrico Maria Salerno,Mario Carotenuto,Malcolm McDowell,Anthony Hopkins,filmtv_id,year,duration,votes
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,2,1982.0,76,31
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,1991.0,98,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,17,1976.0,91,10
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,1982.0,95,15
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,20,1942.0,85,12


In [79]:
Y_1 = filmtv_complete_ohe_1['avg_vote']
Y_1.head()

0    7.7
1    7.0
2    5.5
3    7.2
4    5.8
Name: avg_vote, dtype: float64

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_1, Y_1, test_size=0.33, random_state=42)

In [81]:
model_0 = XGBClassifier()

In [82]:
model_0.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [83]:
# Make predictions on model
y_pred = model_0.predict(X_test)
#predictions = [round(value) for value in y_pred]

In [84]:
# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse.round(5)))

Root Mean Squared Error: 1.45148


In [85]:
# Predicting Movies with XGB
top20_movies_0 = X_1[:20]
y_top20_movies_0 = model_0.predict(top20_movies_0)
# Creating DataFrame to Visualize Prediction Results
y_top20_movies_0 = pd.DataFrame(y_top20_movies_0)
movie_names_0 = filmtv_complete_ohe['title'][:20]
actual_avg_vote_0 = filmtv_complete_ohe['avg_vote'][:20]
frames_0 = [movie_names_0, y_top20_movies_0, actual_avg_vote_0]
movie_names_pred_0 = pd.concat(frames_0, axis=1)
movie_names_pred_0.columns = ['Title', 'Predicted Avg Vote', 'Avg Vote']
movie_names_pred_0.sort_values(['Predicted Avg Vote'],ascending=False)

Unnamed: 0,Title,Predicted Avg Vote,Avg Vote
6,A ciascuno il suo,8.1,7.8
14,In Bed With Madonna,7.7,5.1
16,Bowery at Midnight,7.2,5.8
10,A Ghentar si muore facile,6.0,5.5
11,Does This Mean We are Married?,6.0,4.0
15,Make Mine Mink,6.0,8.0
13,Love Kills,6.0,8.0
19,Crazy People,6.0,6.0
5,The Uranian Conspiracy,6.0,5.0
3,Diner,6.0,7.2
