In [34]:
import pandas as pd
import numpy as np
from pprint import pprint

In [35]:
# Importando dados
movies_df = pd.read_csv('../filmes.csv')

In [36]:
# Visualizando colunas do dataset
variables = list(movies_df.columns.values)
variables_str = '\n'.join(sorted(variables))
print('Variáveis:\n{}'.format(variables_str))
movies_df.head()

Variáveis:
actor_1_facebook_likes
actor_1_name
actor_2_facebook_likes
actor_2_name
actor_3_facebook_likes
actor_3_name
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
duration
facenumber_in_poster
genres
gross
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
title_year


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## 1. Deletando entradas com 1/3 dos valores ausentes

In [37]:
from math import ceil

num_variables = len(variables)
threshold = ceil((1/3)*num_variables)
print("Número de variáveis: {}. Limite de {} valores ausentes".format(num_variables, threshold))

# 1. Substituir 0's por NaN's
movies_df = movies_df.applymap(lambda x: np.nan if x == 0.0 else x)
size = len(movies_df)
# 2. Contabilizar NaNs para cada entrada
count_missing = movies_df.isnull().sum(axis='columns')
# 3. Remover
movies_df = movies_df[count_missing < threshold]
print("{} de {} entradas removidas".format(size-len(movies_df), size))

Número de variáveis: 28. Limite de 10 valores ausentes
16 de 5043 entradas removidas


## 2.0 Separando variáveis categóricas das variáveis numéricas

In [114]:
categorical_variables = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name', 'genres', 'language', 'movie_title', 'movie_imdb_link', 'plot_keywords', 'color', 'content_rating']
numerical_variables = [v for v in variables if v not in categorical_variables]

categorical_df = movies_df[categorical_variables]
numerical_df = movies_df[numerical_variables]

## 3. Processando variáveis categóricas

In [39]:
# Descrição do dataset categórico puro
categorical_df.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
count,5025,5023,5016,5024,4928,5027,5018,5027,5027,4881,5011
unique,2086,3026,3518,64,2387,911,47,4901,4903,4751,2
top,Robert De Niro,Morgan Freeman,Steve Coogan,USA,Steven Spielberg,Drama,English,Victor Frankenstein,http://www.imdb.com/title/tt0077651/?ref_=fn_t...,based on novel,Color
freq,49,20,8,3796,26,236,4693,3,3,4,4802


In [40]:
# Amostra do dataset
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color


### 3.0 Uniformizando o texto

In [41]:
# Tirando todos os espaços em branco antes e depois das palavras
categorical_df = categorical_df.applymap(lambda x: x.strip() if type(x) is str else x)

### 3.1 Tratando valores ausentes
Estratégias:
1. Nomes dos atores: Deixar ausente
2. País: 
3. Diretor
4. Idioma:
5. Título: deixar ausente
6. Keywords: deixar ausente
8. Link do imdb: deixar ausente
9. Cor: colorido se número de likes no fb != nan

#### Atores

In [42]:
missing = categorical_df[categorical_df['actor_1_name'].isna() & categorical_df['actor_2_name'].isna() | categorical_df['actor_3_name'].isna()]
missing.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
count,9,7,0.0,11,11,11,11,11,11,9,11
unique,9,7,0.0,7,11,7,2,11,11,9,1
top,Leopold Stokowski,Beatrice Ntuba,,USA,Davis Guggenheim,Documentary,English,Ayurveda: Art of Being,http://www.imdb.com/title/tt2035599/?ref_=fn_t...,3d in title|animal in title|digit in title|dol...,Color
freq,1,1,,5,1,5,10,1,1,1,11


#### País

In [60]:
country_df = categorical_df.country;
country_df.describe()

count     5024
unique      64
top        USA
freq      3796
Name: country, dtype: object

In [63]:
print("Tabela de Frequências")
country_df.value_counts().head()

Tabela de Frequências


USA        3796
UK          447
France      154
Canada      126
Germany      97
Name: country, dtype: int64

In [65]:
## Valores Ausentes
categorical_df[categorical_df['country'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
2370,Nicole 'Snooki' Polizzi,Jenni 'Jwoww' Farley,Paul 'Pauly D' DelVecchio,,,Comedy|Drama|Reality-TV|Romance,English,"Gone, Baby, Gone",http://www.imdb.com/title/tt1697237/?ref_=fn_t...,lifestyle,Color
3397,Dominic Cooper,Joseph Gilgun,Ruth Negga,,,Adventure|Drama|Fantasy|Mystery,English,Preacher,http://www.imdb.com/title/tt5016504/?ref_=fn_t...,heaven and hell|preacher|supernatural|vampire|...,Color
4021,Chris Brochu,Jeff Fahey,Rita Wilson,,Daniel Petrie Jr.,Drama|Thriller,English,Dawn Patrol,http://www.imdb.com/title/tt2073661/?ref_=fn_t...,desert|held at gunpoint|marine|revenge|sex on ...,Color


In [72]:
# Ajustando todos valores ausentes manualmente, baseando-se no país
categorical_df.loc[2370, 'country'] = "USA"
categorical_df.loc[2370, 'director_name'] = "Ben Affleck"

categorical_df.loc[3397, 'country'] = "USA"
categorical_df.loc[3397, 'director_name'] = "Sam Catlin"

categorical_df.loc[4021, 'country'] = "USA"

#### Diretor

In [74]:
categorical_df[categorical_df['director_name'].isna()]
# TODO: decidir como tratar

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
177,Don Johnson,Philip Michael Thomas,John Diehl,USA,,Action|Crime|Drama|Mystery|Thriller,English,Miami Vice,http://www.imdb.com/title/tt0086759/?ref_=fn_t...,cult tv|detective|drugs|police|undercover,Color
260,George Peppard,Dirk Benedict,Dwight Schultz,USA,,Action|Adventure|Crime,English,The A-Team,http://www.imdb.com/title/tt0084967/?ref_=fn_t...,1980s|cult tv|famous opening theme|good versus...,Color
404,Caroline Dhavernas,Scott Thompson,Hettienne Park,USA,,Crime|Drama|Horror|Mystery|Thriller,English,Hannibal,http://www.imdb.com/title/tt2243973/?ref_=fn_t...,blood|cannibalism|fbi|manipulation|psychiatrist,Color
459,Elden Henson,Royce Johnson,Charlie Cox,USA,,Action|Adventure|Crime|Drama|Sci-Fi|Thriller,English,Daredevil,http://www.imdb.com/title/tt3322312/?ref_=fn_t...,corruption|lawyer|partnership|superhero|vigilante,
479,Elizabeth Montgomery,Agnes Moorehead,Dick York,USA,,Comedy|Family|Fantasy,English,Bewitched,http://www.imdb.com/title/tt0057733/?ref_=fn_t...,connecticut|magic|marriage|witch|witchcraft,Black and White
537,Harold Perrineau,Matt Ryan,Charles Halford,USA,,Drama|Fantasy|Horror|Thriller,English,Constantine,http://www.imdb.com/title/tt3489184/?ref_=fn_t...,based on comic|based on comic book|dc arrowver...,Color
543,Adam Arkin,Brent Sexton,Damian Lewis,USA,,Crime|Drama|Mystery,English,Life,http://www.imdb.com/title/tt0874936/?ref_=fn_t...,cop|murder|partner|police|protective male,Color
593,Nicole Beharie,Katia Winter,Lyndie Greenwood,USA,,Adventure|Drama|Fantasy|Mystery|Thriller,English,Sleepy Hollow,http://www.imdb.com/title/tt2647544/?ref_=fn_t...,apocalypse|death|husband wife relationship|mot...,Color
645,Hector Elizondo,Kaitlyn Dever,Nancy Travis,USA,,Comedy,English,Last Man Standing,http://www.imdb.com/title/tt1828327/?ref_=fn_t...,family relationships|husband wife relationship...,Color
685,Jason Flemyng,James Nesbitt,Frances O'Connor,UK,,Crime|Drama|Mystery,English,The Missing,http://www.imdb.com/title/tt3877200/?ref_=fn_t...,france|journalist|limp|police detective|reporter,Color


#### Gêneros e keywords (ambos OK)

In [102]:
genres = categorical_df['genres'].astype(str)
genres[genres=='nan']

Series([], Name: genres, dtype: object)

#### Idioma

In [106]:
categorical_df['language'].value_counts().head()

English     4693
French        73
Spanish       40
Hindi         28
Mandarin      26
Name: language, dtype: int64

In [108]:
categorical_df.loc[categorical_df['language'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
3086,Jon Gries,Taylor Handley,Trent Ford,USA,Christopher Cain,Drama|History|Romance|Western,,September Dawn,http://www.imdb.com/title/tt0473700/?ref_=fn_t...,massacre|mormon|settler|utah|wagon train,Color
3539,Debi Derryberry,Kate Higgins,Cindy Robinson,USA,Richard Rich,Action|Adventure|Animation|Comedy|Drama|Family...,,Alpha and Omega 4: The Legend of the Saw Tooth...,http://www.imdb.com/title/tt4061848/?ref_=fn_t...,blindness|cave|spirit|wolf|wolf cub,
3869,Sid Caesar,Dom DeLuise,Bernadette Peters,USA,Mel Brooks,Comedy|Romance,,Silent Movie,http://www.imdb.com/title/tt0075222/?ref_=fn_t...,black comedy|friend|modern silent movie|silent...,Color
4110,William Morgan Sheppard,Kevin Gage,Brianna Brown,USA,Michael Landon Jr.,Drama|Family|Western,,Love's Abiding Joy,http://www.imdb.com/title/tt0785025/?ref_=fn_t...,19th century|faith|mayor|ranch|sheriff,Color
4409,Matthew Ziff,T.J. Storm,Sam Medina,USA,John Stockwell,Action,,Kickboxer: Vengeance,http://www.imdb.com/title/tt3082898/?ref_=fn_t...,,
4630,Justin Baldoni,Luke Perry,Leonor Varela,USA,Jonathan Meyers,Drama,,A Fine Step,http://www.imdb.com/title/tt1604100/?ref_=fn_t...,,
4810,Lillian Gish,Mae Marsh,Walter Long,USA,D.W. Griffith,Drama|History|War,,Intolerance: Love's Struggle Throughout the Ages,http://www.imdb.com/title/tt0006864/?ref_=fn_t...,huguenot|intolerance|medicis|protestant|wedding,Black and White
4885,John Gilbert,Renée Adorée,Claire Adams,USA,King Vidor,Drama|Romance|War,,The Big Parade,http://www.imdb.com/title/tt0015624/?ref_=fn_t...,chewing gum|climbing a tree|france|translation...,Black and White
4958,Stephen Carr,Johnnie Walker,Mary Carr,USA,Harry F. Millarde,Crime|Drama,,Over the Hill to the Poorhouse,http://www.imdb.com/title/tt0011549/?ref_=fn_t...,family relationships|gang|idler|poorhouse|thief,Black and White


In [113]:
#Todos os que faltam são americanos, logo são em inglês
categorical_df.loc[categorical_df['language'].isna(), 'language'] = 'English'

#### Classificação Indicativa

In [115]:
categorical_df.loc[categorical_df['content_rating'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
98,Mark Chinnery,Shin'ya Tsukamoto,Atsuko Maeda,Japan,Hideaki Anno,Action|Adventure|Drama|Horror|Sci-Fi,Japanese,Godzilla Resurgence,http://www.imdb.com/title/tt4262980/?ref_=fn_t...,blood|godzilla|monster|sequel,Color,
199,Rupert Grint,Dave Legeno,Ralph Ineson,UK,Matt Birch,Action|Fantasy,English,Harry Potter and the Deathly Hallows: Part II,http://www.imdb.com/title/tt1680310/?ref_=fn_t...,,Color,
204,Mark Chinnery,Shin'ya Tsukamoto,Atsuko Maeda,Japan,Hideaki Anno,Action|Adventure|Drama|Horror|Sci-Fi,Japanese,Godzilla Resurgence,http://www.imdb.com/title/tt4262980/?ref_=fn_t...,blood|godzilla|monster|sequel,Color,
206,Rupert Grint,Toby Jones,Alfred Enoch,UK,Matt Birch,Fantasy,English,Harry Potter and the Deathly Hallows: Part I,http://www.imdb.com/title/tt1571403/?ref_=fn_t...,,Color,
242,Alain Delon,Santiago Segura,Vanessa Hessler,France,Frédéric Forestier,Adventure|Comedy|Family|Fantasy,French,Asterix at the Olympic Games,http://www.imdb.com/title/tt0463872/?ref_=fn_t...,1st century b.c.|lightsaber|local blockbuster|...,Color,
489,Nissim Renard,Roxane Duran,Julie-Marie Parmentier,France,Lucile Hadzihalilovic,Drama|Horror|Mystery|Sci-Fi,French,Evolution,http://www.imdb.com/title/tt4291590/?ref_=fn_t...,boy|giving birth|nurse|sea|ultrasonography,Color,
543,Adam Arkin,Brent Sexton,Damian Lewis,USA,,Crime|Drama|Mystery,English,Life,http://www.imdb.com/title/tt0874936/?ref_=fn_t...,cop|murder|partner|police|protective male,Color,
685,Jason Flemyng,James Nesbitt,Frances O'Connor,UK,,Crime|Drama|Mystery,English,The Missing,http://www.imdb.com/title/tt3877200/?ref_=fn_t...,france|journalist|limp|police detective|reporter,Color,
810,Li Gong,Aaron Kwok,Eddie Peng,China,Pou-Soi Cheang,Action|Adventure|Fantasy,English,Xi you ji zhi: Sun Wukong san da Baigu Jing,http://www.imdb.com/title/tt4591310/?ref_=fn_t...,buddhism|demon|journey to the west|monk|monkey...,Color,
833,Barry Corbin,Noureen DeWulf,Brian Austin Green,USA,,Comedy|Romance,English,Anger Management,http://www.imdb.com/title/tt1986770/?ref_=fn_t...,anger management|argument|irony|sarcasm|therapist,Color,


In [165]:
rating_freq = categorical_df['content_rating'].value_counts()
rating_labels = rating_freq.index
rating_freqs = rating_freq.values
normalization_factor = sum(rating_freqs) #normalizando
weigths = [i/normalization_factor for i in rating_freqs]
rating_freq

R            2118
PG-13        1460
PG            701
Not Rated     116
G             112
Unrated        62
Approved       55
TV-14          30
TV-MA          20
TV-PG          13
X              13
TV-G           10
Passed          9
NC-17           7
GP              6
M               5
TV-Y            1
TV-Y7           1
Name: content_rating, dtype: int64

In [171]:
# Estratégia: Atribuir a classificação mais frequente de cada diretor
from numpy.random import choice
missing = categorical_df.loc[categorical_df['content_rating'].isna()]
for index, row in missing.iterrows():
    ratings = categorical_df.loc[categorical_df['director_name'] == row['director_name'], 'content_rating'].value_counts()
    if len(ratings)> 0:
        rating = ratings.index[0]
    else:
        rating = choice(rating_labels, p=weigths) # se não achar realiza sorteio poderado com frequência
    categorical_df.loc[index, 'content_rating'] = rating

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating


#### Cor

In [47]:
# Visualização dos dados antes de tratar dados ausentes
color_df = categorical_df['color']
print("Frequências:\n{}".format(color_df.astype(str).value_counts()))

Frequências:
Color              4802
Black and White     209
nan                  16
Name: color, dtype: int64


In [11]:
#Tratando dados ausentes
is_color = numerical_df['movie_facebook_likes'] > 0

# Não tem valor e é colorido
color_df = categorical_df[is_color & categorical_df['color'].isna()]
color_df['color'] = 'Color'
categorical_df.update(color_df)

# Não tem valor e é preto e branco
color_df = categorical_df[~is_color & categorical_df['color'].isna()]
color_df['color'] = 'Black and White'

# Atualizando as entradas do dataframe
categorical_df.update(color_df)
categorical_df['color'].describe()

print("Valores presentes: {}".format(set(categorical_df['color'].values)))

Valores presentes: {'Color', 'Black and White'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [12]:
# TODO: fazer com o resto das variáveis


### 3.2 Codificando colunas em formato de lista (gênero e keywords)

In [13]:
#Bibliotecas
from sklearn.preprocessing import LabelEncoder

#### Gêneros

In [14]:
#Processando gênero
# Transformando string em lista
genres =  categorical_df.genres.apply(lambda x: x.split('|'))
genres.head()

0    [Action, Adventure, Fantasy, Sci-Fi]
1            [Action, Adventure, Fantasy]
2           [Action, Adventure, Thriller]
3                      [Action, Thriller]
5             [Action, Adventure, Sci-Fi]
Name: genres, dtype: object

In [15]:
# Quebrando o array em colunas
genres = genres.apply(pd.Series)
genres.rename(lambda x: 'genre_{}'.format(x), axis='columns', inplace=True)
genres = genres.astype(str) # as que não tem valor ficam como nan
genres.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,Action,Adventure,Fantasy,Sci-Fi,,,,
1,Action,Adventure,Fantasy,,,,,
2,Action,Adventure,Thriller,,,,,
3,Action,Thriller,,,,,,
5,Action,Adventure,Sci-Fi,,,,,


In [16]:
#Convertendo as categorias para inteiros
all_genres = set()
genres.applymap(lambda x: all_genres.add(x))
genre_encoder = LabelEncoder()
genre_encoder.fit(list(all_genres))
genres = genres.apply(genre_encoder.transform)
genres.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,0,1,9,19,25,25,25,25
1,0,1,9,25,25,25,25,25
2,0,1,22,25,25,25,25,25
3,0,22,25,25,25,25,25,25
5,0,1,19,25,25,25,25,25


In [17]:
# Colocando de volta no Dataframe
categorical_df.drop('genres', axis='columns', inplace=True)
categorical_df = categorical_df.join(genres)
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,language,movie_title,movie_imdb_link,plot_keywords,color,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,0,1,9,19,25,25,25,25
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,0,1,9,25,25,25,25,25
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,0,1,22,25,25,25,25,25
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,0,22,25,25,25,25,25,25
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,0,1,19,25,25,25,25,25


#### Keywords

In [18]:
# Processando keywords
keywords = categorical_df.plot_keywords.apply(lambda x: x.split('|') if type(x) is str else 'None')
keywords.head()

0         [avatar, future, marine, native, paraplegic]
1    [goddess, marriage ceremony, marriage proposal...
2            [bomb, espionage, sequel, spy, terrorist]
3    [deception, imprisonment, lawlessness, police ...
5    [alien, american civil war, male nipple, mars,...
Name: plot_keywords, dtype: object

In [19]:
# Quebrando o array em colunas
keywords = keywords.apply(pd.Series)
keywords.rename(lambda x: 'keyword_{}'.format(x), axis='columns', inplace=True)
keywords = keywords.astype(str) # nan is the value for missing

In [20]:
#Convertendo as categorias para inteiros
all_keywords = set()
keywords.applymap(lambda x: all_keywords.add(x))
keywords_encoder = LabelEncoder()
keywords_encoder.fit(list(all_keywords))
keywords = keywords.apply(keywords_encoder.transform)
keywords.head()

Unnamed: 0,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4
0,515,2866,4301,4759,5158
1,3026,4314,4317,5313,6522
2,875,2327,6334,6785,7202
3,1894,3565,3964,5405,7206
5,216,264,4249,4319,5544


In [21]:
# Colocando de volta no Dataframe
categorical_df.drop('plot_keywords', axis='columns', inplace=True)
categorical_df = categorical_df.join(keywords)
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,language,movie_title,movie_imdb_link,color,genre_0,...,genre_3,genre_4,genre_5,genre_6,genre_7,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,Color,0,...,19,25,25,25,25,515,2866,4301,4759,5158
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,Color,0,...,25,25,25,25,25,3026,4314,4317,5313,6522
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,Color,0,...,25,25,25,25,25,875,2327,6334,6785,7202
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,Color,0,...,25,25,25,25,25,1894,3565,3964,5405,7206
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,Color,0,...,25,25,25,25,25,216,264,4249,4319,5544
