In [128]:
import pandas as pd
import numpy as np
from pprint import pprint

In [129]:
# Importando dados
movies_df = pd.read_csv('../filmes.csv')

In [130]:
# Visualizando colunas do dataset
variables = list(movies_df.columns.values)
variables_str = '\n'.join(sorted(variables))
print('Variáveis:\n{}'.format(variables_str))
movies_df.head()
len(movies_df.columns)

Variáveis:
actor_1_facebook_likes
actor_1_name
actor_2_facebook_likes
actor_2_name
actor_3_facebook_likes
actor_3_name
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
duration
facenumber_in_poster
genres
gross
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
title_year


28

## 1. Deletando entradas com 1/3 dos valores ausentes

In [131]:
from math import ceil

num_variables = len(variables)
threshold = ceil((1/3)*num_variables)
print("Número de variáveis: {}. Limite de {} valores ausentes".format(num_variables, threshold))

    
# 1. Substituir 0's por NaN's
movies_df = movies_df.applymap(lambda x: np.nan if x == 0.0 else x)
size = len(movies_df)
# 2. Contabilizar NaNs para cada entrada
count_missing = movies_df.isnull().sum(axis='columns')
# 3. Remover
movies_df = movies_df[count_missing < threshold]
print("{} de {} entradas removidas".format(size-len(movies_df), size))

Número de variáveis: 28. Limite de 10 valores ausentes
16 de 5043 entradas removidas


## 2.0 Separando variáveis categóricas das variáveis numéricas

In [132]:
categorical_variables = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name', 'genres', 'language', 'movie_title', 'movie_imdb_link', 'plot_keywords', 'color', 'content_rating']
numerical_variables = [v for v in variables if v not in categorical_variables]

categorical_df = movies_df[categorical_variables]
numerical_df = movies_df[numerical_variables]

## 3. Processando variáveis categóricas

In [133]:
# Descrição do dataset categórico puro
categorical_df.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,5025,5023,5016,5024,4928,5027,5018,5027,5027,4881,5011,4739
unique,2086,3026,3518,64,2387,911,47,4901,4903,4751,2,18
top,Robert De Niro,Morgan Freeman,John Heard,USA,Steven Spielberg,Drama,English,Halloween,http://www.imdb.com/title/tt2638144/?ref_=fn_t...,based on novel,Color,R
freq,49,20,8,3796,26,236,4693,3,3,4,4802,2118


In [134]:
# Amostra do dataset
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,PG-13
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,PG-13
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,PG-13
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,PG-13
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,PG-13


### 3.0 Uniformizando o texto

In [135]:
# Tirando todos os espaços em branco antes e depois das palavras
categorical_df = categorical_df.applymap(lambda x: x.strip() if type(x) is str else x)

### 3.1 Tratando valores ausentes
Estratégias:
1. Nomes dos atores: Deixar ausente
2. País: 
3. Diretor
4. Idioma:
5. Título: deixar ausente
6. Keywords: deixar ausente
8. Link do imdb: deixar ausente
9. Cor: colorido se número de likes no fb != nan

#### Atores

In [136]:
missing = categorical_df[categorical_df['actor_1_name'].isna() & categorical_df['actor_2_name'].isna() | categorical_df['actor_3_name'].isna()]
missing.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,9,7,0.0,11,11,11,11,11,11,9,11,8
unique,9,7,0.0,7,11,7,2,11,11,9,1,5
top,Jacques Perrin,Bobby Kendall,,USA,Jacques Perrin,Documentary,English,All Is Lost,http://www.imdb.com/title/tt0302674/?ref_=fn_t...,boat|container|sea|shipping container|storm,Color,Not Rated
freq,1,1,,5,1,5,10,1,1,1,11,3


#### País

In [137]:
country_df = categorical_df.country;
country_df.describe()

count     5024
unique      64
top        USA
freq      3796
Name: country, dtype: object

In [138]:
print("Tabela de Frequências")
country_df.value_counts().head()

Tabela de Frequências


USA        3796
UK          447
France      154
Canada      126
Germany      97
Name: country, dtype: int64

In [139]:
## Valores Ausentes
categorical_df[categorical_df['country'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
2370,Nicole 'Snooki' Polizzi,Jenni 'Jwoww' Farley,Paul 'Pauly D' DelVecchio,,,Comedy|Drama|Reality-TV|Romance,English,"Gone, Baby, Gone",http://www.imdb.com/title/tt1697237/?ref_=fn_t...,lifestyle,Color,TV-14
3397,Dominic Cooper,Joseph Gilgun,Ruth Negga,,,Adventure|Drama|Fantasy|Mystery,English,Preacher,http://www.imdb.com/title/tt5016504/?ref_=fn_t...,heaven and hell|preacher|supernatural|vampire|...,Color,TV-MA
4021,Chris Brochu,Jeff Fahey,Rita Wilson,,Daniel Petrie Jr.,Drama|Thriller,English,Dawn Patrol,http://www.imdb.com/title/tt2073661/?ref_=fn_t...,desert|held at gunpoint|marine|revenge|sex on ...,Color,


In [140]:
# Ajustando todos valores ausentes manualmente, baseando-se no país
categorical_df.loc[2370, 'country'] = "USA"
categorical_df.loc[2370, 'director_name'] = "Ben Affleck"

categorical_df.loc[3397, 'country'] = "USA"
categorical_df.loc[3397, 'director_name'] = "Sam Catlin"

categorical_df.loc[4021, 'country'] = "USA"

#### Diretor

In [141]:
categorical_df.loc[categorical_df['director_name'].isna(), 'movie_title']

177                     Miami Vice
260                     The A-Team
404                       Hannibal
459                      Daredevil
479                      Bewitched
537                    Constantine
543                           Life
593                  Sleepy Hollow
645              Last Man Standing
685                    The Missing
757            Rules of Engagement
816     Sabrina, the Teenage Witch
826               Sex and the City
833               Anger Management
857                  Stargate SG-1
962                    Unforgotten
994               A Touch of Frost
1007                       Twisted
1023                      Defiance
1035                     Outlander
1133                  The Returned
1175                 McHale's Navy
1265                        Arthur
1346         3rd Rock from the Sun
1404                     Rush Hour
1444                 Hit the Floor
1499                        Luther
1620           Friday Night Lights
1659                

#### Gêneros e keywords (ambos OK)

In [142]:
genres = categorical_df['genres'].astype(str)
genres[genres=='nan']

Series([], Name: genres, dtype: object)

#### Idioma

In [143]:
categorical_df['language'].value_counts().head()

English     4693
French        73
Spanish       40
Hindi         28
Mandarin      26
Name: language, dtype: int64

In [144]:
categorical_df.loc[categorical_df['language'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
3086,Jon Gries,Taylor Handley,Trent Ford,USA,Christopher Cain,Drama|History|Romance|Western,,September Dawn,http://www.imdb.com/title/tt0473700/?ref_=fn_t...,massacre|mormon|settler|utah|wagon train,Color,R
3539,Debi Derryberry,Kate Higgins,Cindy Robinson,USA,Richard Rich,Action|Adventure|Animation|Comedy|Drama|Family...,,Alpha and Omega 4: The Legend of the Saw Tooth...,http://www.imdb.com/title/tt4061848/?ref_=fn_t...,blindness|cave|spirit|wolf|wolf cub,,
3869,Sid Caesar,Dom DeLuise,Bernadette Peters,USA,Mel Brooks,Comedy|Romance,,Silent Movie,http://www.imdb.com/title/tt0075222/?ref_=fn_t...,black comedy|friend|modern silent movie|silent...,Color,PG
4110,William Morgan Sheppard,Kevin Gage,Brianna Brown,USA,Michael Landon Jr.,Drama|Family|Western,,Love's Abiding Joy,http://www.imdb.com/title/tt0785025/?ref_=fn_t...,19th century|faith|mayor|ranch|sheriff,Color,PG
4409,Matthew Ziff,T.J. Storm,Sam Medina,USA,John Stockwell,Action,,Kickboxer: Vengeance,http://www.imdb.com/title/tt3082898/?ref_=fn_t...,,,
4630,Justin Baldoni,Luke Perry,Leonor Varela,USA,Jonathan Meyers,Drama,,A Fine Step,http://www.imdb.com/title/tt1604100/?ref_=fn_t...,,,PG
4810,Lillian Gish,Mae Marsh,Walter Long,USA,D.W. Griffith,Drama|History|War,,Intolerance: Love's Struggle Throughout the Ages,http://www.imdb.com/title/tt0006864/?ref_=fn_t...,huguenot|intolerance|medicis|protestant|wedding,Black and White,Not Rated
4885,John Gilbert,Renée Adorée,Claire Adams,USA,King Vidor,Drama|Romance|War,,The Big Parade,http://www.imdb.com/title/tt0015624/?ref_=fn_t...,chewing gum|climbing a tree|france|translation...,Black and White,Not Rated
4958,Stephen Carr,Johnnie Walker,Mary Carr,USA,Harry F. Millarde,Crime|Drama,,Over the Hill to the Poorhouse,http://www.imdb.com/title/tt0011549/?ref_=fn_t...,family relationships|gang|idler|poorhouse|thief,Black and White,


In [145]:
#Todos os que faltam são americanos, logo são em inglês
categorical_df.loc[categorical_df['language'].isna(), 'language'] = 'English'

#### Classificação Indicativa

In [146]:
categorical_df.loc[categorical_df['content_rating'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
98,Mark Chinnery,Shin'ya Tsukamoto,Atsuko Maeda,Japan,Hideaki Anno,Action|Adventure|Drama|Horror|Sci-Fi,Japanese,Godzilla Resurgence,http://www.imdb.com/title/tt4262980/?ref_=fn_t...,blood|godzilla|monster|sequel,Color,
199,Rupert Grint,Dave Legeno,Ralph Ineson,UK,Matt Birch,Action|Fantasy,English,Harry Potter and the Deathly Hallows: Part II,http://www.imdb.com/title/tt1680310/?ref_=fn_t...,,Color,
204,Mark Chinnery,Shin'ya Tsukamoto,Atsuko Maeda,Japan,Hideaki Anno,Action|Adventure|Drama|Horror|Sci-Fi,Japanese,Godzilla Resurgence,http://www.imdb.com/title/tt4262980/?ref_=fn_t...,blood|godzilla|monster|sequel,Color,
206,Rupert Grint,Toby Jones,Alfred Enoch,UK,Matt Birch,Fantasy,English,Harry Potter and the Deathly Hallows: Part I,http://www.imdb.com/title/tt1571403/?ref_=fn_t...,,Color,
242,Alain Delon,Santiago Segura,Vanessa Hessler,France,Frédéric Forestier,Adventure|Comedy|Family|Fantasy,French,Asterix at the Olympic Games,http://www.imdb.com/title/tt0463872/?ref_=fn_t...,1st century b.c.|lightsaber|local blockbuster|...,Color,
489,Nissim Renard,Roxane Duran,Julie-Marie Parmentier,France,Lucile Hadzihalilovic,Drama|Horror|Mystery|Sci-Fi,French,Evolution,http://www.imdb.com/title/tt4291590/?ref_=fn_t...,boy|giving birth|nurse|sea|ultrasonography,Color,
543,Adam Arkin,Brent Sexton,Damian Lewis,USA,,Crime|Drama|Mystery,English,Life,http://www.imdb.com/title/tt0874936/?ref_=fn_t...,cop|murder|partner|police|protective male,Color,
685,Jason Flemyng,James Nesbitt,Frances O'Connor,UK,,Crime|Drama|Mystery,English,The Missing,http://www.imdb.com/title/tt3877200/?ref_=fn_t...,france|journalist|limp|police detective|reporter,Color,
810,Li Gong,Aaron Kwok,Eddie Peng,China,Pou-Soi Cheang,Action|Adventure|Fantasy,English,Xi you ji zhi: Sun Wukong san da Baigu Jing,http://www.imdb.com/title/tt4591310/?ref_=fn_t...,buddhism|demon|journey to the west|monk|monkey...,Color,
833,Barry Corbin,Noureen DeWulf,Brian Austin Green,USA,,Comedy|Romance,English,Anger Management,http://www.imdb.com/title/tt1986770/?ref_=fn_t...,anger management|argument|irony|sarcasm|therapist,Color,


In [147]:
rating_freq = categorical_df['content_rating'].value_counts()
rating_labels = rating_freq.index
rating_freqs = rating_freq.values
normalization_factor = sum(rating_freqs) #normalizando
weigths = [i/normalization_factor for i in rating_freqs]
rating_freq

R            2118
PG-13        1460
PG            701
Not Rated     116
G             112
Unrated        62
Approved       55
TV-14          30
TV-MA          20
X              13
TV-PG          13
TV-G           10
Passed          9
NC-17           7
GP              6
M               5
TV-Y            1
TV-Y7           1
Name: content_rating, dtype: int64

In [148]:
# Estratégia: Atribuir a classificação mais frequente de cada diretor
from numpy.random import choice
missing = categorical_df.loc[categorical_df['content_rating'].isna()]
for index, row in missing.iterrows():
    ratings = categorical_df.loc[categorical_df['director_name'] == row['director_name'], 'content_rating'].value_counts()
    if len(ratings)> 0:
        rating = ratings.index[0]
    else:
        rating = choice(rating_labels, p=weigths) # se não achar realiza sorteio poderado com frequência
    categorical_df.loc[index, 'content_rating'] = rating

#### Cor

In [60]:
# Visualização dos dados antes de tratar dados ausentes
color_df = categorical_df['color']
print("Frequências:\n{}".format(color_df.astype(str).value_counts()))
animations = movies_df['genres'].apply(lambda x: "Animation" in x if type(x) is str else x)

Frequências:
Color               4802
 Black and White     209
nan                   16
Name: color, dtype: int64


0       False
1       False
2       False
3       False
5       False
6       False
7        True
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
30      False
        ...  
5012    False
5013    False
5014    False
5015    False
5016    False
5017    False
5018    False
5019    False
5020    False
5021    False
5022    False
5023    False
5024    False
5025    False
5026    False
5027    False
5028    False
5029    False
5030    False
5031    False
5032    False
5033    False
5034    False
5035    False
5036    False
5037    False
5038    False
5039    False
5041    False
5042    False
Name: genres, Length: 5027, dtype: bool

In [26]:
movies_df.loc[movies_df['color'] == 'Color', 'title_year'].value_counts()
movies_df.loc[movies_df['title_year'] == 1937, 'movie_title']
movies_df.loc[4449] # Animações dps de 1937 são coloridas
movies_df[movies_df['']]

color                                                                    Color
director_name                                                 William Cottrell
num_critic_for_reviews                                                     145
duration                                                                    83
director_facebook_likes                                                    NaN
actor_3_facebook_likes                                                      31
actor_2_name                                                     Billy Gilbert
actor_1_facebook_likes                                                      82
gross                                                              1.84925e+08
genres                                        Animation|Family|Fantasy|Musical
actor_1_name                                                 Adriana Caselotti
movie_title                                   Snow White and the Seven Dwarfs 
num_voted_users                                     

In [149]:
#Tratando dados ausentes
is_color = numerical_df['movie_facebook_likes'] > 0

# Não tem valor e é colorido
color_df = categorical_df[is_color & categorical_df['color'].isna()]
color_df['color'] = 'Color'
categorical_df.update(color_df)

# Não tem valor e é preto e branco
color_df = categorical_df[~is_color & categorical_df['color'].isna()]
color_df['color'] = 'Black and White'

# Atualizando as entradas do dataframe
categorical_df.update(color_df)
categorical_df['color'].describe()

print("Valores presentes: {}".format(set(categorical_df['color'].values)))

Valores presentes: {'Color', 'Black and White'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


### 3.2 Codificando colunas em formato de lista (gênero e keywords)

#### Gêneros

In [119]:
#Processando gênero
# Transformando string em lista
genres =  categorical_df.genres.apply(lambda x: x.replace('|', ' ').replace('-', ''))
genres.head()

0    Action Adventure Fantasy SciFi
1          Action Adventure Fantasy
2         Action Adventure Thriller
3                   Action Thriller
5            Action Adventure SciFi
Name: genres, dtype: object

In [120]:
vocabulary = set()
genres.apply(lambda x: vocabulary.update(x.split(' ')))

from sklearn.feature_extraction.text import CountVectorizer
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(vocabulary)
vocabulary = sorted(genre_vectorizer.vocabulary_)
vocabulary

['action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'filmnoir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'news',
 'realitytv',
 'romance',
 'scifi',
 'short',
 'sport',
 'thriller',
 'war',
 'western']

In [126]:
vect_genres = genre_vectorizer.transform(genres.values)
genres_binary_df = pd.DataFrame(vect_genres.toarray())
genres_binary_df.rename(lambda x: 'genre_'+vocabulary[int(x)], axis='columns', inplace=True)
genres_binary_df.head()

Unnamed: 0,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,genre_fantasy,...,genre_mystery,genre_news,genre_realitytv,genre_romance,genre_scifi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [150]:
# Colocando de volta no Dataframe
categorical_df.drop('genres', axis='columns', inplace=True)
categorical_df = categorical_df.join(genres_binary_df)
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,language,movie_title,movie_imdb_link,plot_keywords,color,...,genre_mystery,genre_news,genre_realitytv,genre_romance,genre_scifi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Keywords

In [18]:
# Processando keywords
keywords = categorical_df.plot_keywords.apply(lambda x: x.split('|') if type(x) is str else 'None')
keywords.head()

0         [avatar, future, marine, native, paraplegic]
1    [goddess, marriage ceremony, marriage proposal...
2            [bomb, espionage, sequel, spy, terrorist]
3    [deception, imprisonment, lawlessness, police ...
5    [alien, american civil war, male nipple, mars,...
Name: plot_keywords, dtype: object

In [19]:
# Quebrando o array em colunas
keywords = keywords.apply(pd.Series)
keywords.rename(lambda x: 'keyword_{}'.format(x), axis='columns', inplace=True)
keywords = keywords.astype(str) # nan is the value for missing

In [20]:
#Convertendo as categorias para inteiros
all_keywords = set()
keywords.applymap(lambda x: all_keywords.add(x))
keywords_encoder = LabelEncoder()
keywords_encoder.fit(list(all_keywords))
keywords = keywords.apply(keywords_encoder.transform)
keywords.head()

Unnamed: 0,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4
0,515,2866,4301,4759,5158
1,3026,4314,4317,5313,6522
2,875,2327,6334,6785,7202
3,1894,3565,3964,5405,7206
5,216,264,4249,4319,5544


In [21]:
# Colocando de volta no Dataframe
categorical_df.drop('plot_keywords', axis='columns', inplace=True)
categorical_df = categorical_df.join(keywords)
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,language,movie_title,movie_imdb_link,color,genre_0,...,genre_3,genre_4,genre_5,genre_6,genre_7,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,Color,0,...,19,25,25,25,25,515,2866,4301,4759,5158
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,Color,0,...,25,25,25,25,25,3026,4314,4317,5313,6522
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,Color,0,...,25,25,25,25,25,875,2327,6334,6785,7202
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,Color,0,...,25,25,25,25,25,1894,3565,3964,5405,7206
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,Color,0,...,25,25,25,25,25,216,264,4249,4319,5544
