In [1]:
import pandas as pd
import numpy as np

In [2]:
# Importando dados
movies_df = pd.read_csv('../filmes.csv')
# Visão geral das colunas e dados ausentes
movies_df.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

## 1. Deletando entradas com 25% dos valores ausentes

In [3]:
num_variables = len(movies_df.columns)
threshold = 0.25*num_variables
print("Número de variáveis: {}. Limite de {} valores ausentes".format(num_variables, threshold))

# 1. Substituir 0's por NaN's
movies_df = movies_df.applymap(lambda x: np.nan if x == 0.0 else x)
size = len(movies_df)
# 2. Contabilizar valores ausentes por linha
count_missing = movies_df.isnull().sum(axis='columns')
# 3. Remover
movies_df = movies_df[count_missing < threshold]
print("{} de {} entradas removidas".format(size-len(movies_df), size))

Número de variáveis: 28. Limite de 7.0 valores ausentes
92 de 5043 entradas removidas


## 2.0 Separando variáveis categóricas das variáveis numéricas

In [4]:
categorical_variables = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name', 'genres', 'language', 'movie_title', 'movie_imdb_link', 'plot_keywords', 'color', 'content_rating']
numerical_variables = [v for v in movies_df.columns if v not in categorical_variables]

categorical_df = movies_df[categorical_variables]
numerical_df = movies_df[numerical_variables]

## 3. Processando variáveis categóricas

In [5]:
# Descrição do dataset categórico puro
categorical_df.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,4951,4951,4945,4950,4889,4951,4944,4951,4951,4838,4939,4715
unique,2039,2969,3465,61,2357,902,45,4825,4827,4709,2,18
top,Robert De Niro,Morgan Freeman,John Heard,USA,Steven Spielberg,Drama,English,Halloween,http://www.imdb.com/title/tt2224026/?ref_=fn_t...,animal name in title|ape abducts a woman|goril...,Color,R
freq,49,20,8,3757,26,231,4630,3,3,3,4731,2117


In [6]:
# Amostra do dataset
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,PG-13
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,PG-13
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,PG-13
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,PG-13
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,PG-13


### 3.0 Uniformizando o texto

In [7]:
# Tirando todos os espaços em branco antes e depois das palavras
categorical_df = categorical_df.applymap(lambda x: x.strip() if type(x) is str else x)

### 3.1 Tratando valores ausentes
Estratégias:
1. Nomes dos atores: Deixar ausente
2. País: 
3. Diretor
4. Idioma:
5. Título: deixar ausente
6. Keywords: deixar ausente
8. Link do imdb: deixar ausente
9. Cor: colorido se número de likes no fb != nan

#### Atores

In [8]:
missing = categorical_df[categorical_df['actor_1_name'].isna() & categorical_df['actor_2_name'].isna() | categorical_df['actor_3_name'].isna()]
missing.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,6,6,0.0,6,6,6,6,6,6,6,6,5
unique,6,6,0.0,3,6,5,1,6,6,6,1,4
top,Don Brooks,Casey Affleck,,USA,Davis Guggenheim,Documentary,English,Gerry,http://www.imdb.com/title/tt0302674/?ref_=fn_t...,climate|earth|global warming|science|truth,Color,G
freq,1,1,,4,1,2,6,1,1,1,6,2


#### País

In [9]:
country_df = categorical_df.country;
country_df.describe()

count     4950
unique      61
top        USA
freq      3757
Name: country, dtype: object

In [10]:
print("Tabela de Frequências")
country_df.value_counts().head()

Tabela de Frequências


USA        3757
UK          433
France      153
Canada      122
Germany      96
Name: country, dtype: int64

In [11]:
## Valores Ausentes
categorical_df[categorical_df['country'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
4021,Chris Brochu,Jeff Fahey,Rita Wilson,,Daniel Petrie Jr.,Drama|Thriller,English,Dawn Patrol,http://www.imdb.com/title/tt2073661/?ref_=fn_t...,desert|held at gunpoint|marine|revenge|sex on ...,Color,


In [12]:
# Ajustando todos valores ausentes manualmente, baseando-se no país
categorical_df.loc[2370, 'country'] = "USA"
categorical_df.loc[2370, 'director_name'] = "Ben Affleck"

categorical_df.loc[3397, 'country'] = "USA"
categorical_df.loc[3397, 'director_name'] = "Sam Catlin"

categorical_df.loc[4021, 'country'] = "USA"

#### Diretor

In [13]:
categorical_df.loc[categorical_df['director_name'].isna()].describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,62,62,62,62,0.0,62,62,62,62,62,62,53
unique,58,60,59,6,0.0,50,3,60,60,60,2,7
top,Krystyna Janda,Olaf Lubaszenko,Tom Kane,USA,,Comedy,English,Saving Grace,http://www.imdb.com/title/tt0092337/?ref_=fn_t...,catholic|death row|inmate|oklahoma city|sexual...,Color,TV-14
freq,2,2,2,51,,4,59,2,2,2,59,22


In [14]:
# Estratégia adotada: descartar
categorical_df = categorical_df.loc[~categorical_df['director_name'].isna()]

#### Gêneros

In [15]:
categorical_df['genres'].describe()

count      4889
unique      893
top       Drama
freq        229
Name: genres, dtype: object

In [16]:
categorical_df[categorical_df['genres'].isna()].describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,0.0,0.0,0.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,USA,Sam Catlin,,,,,,,
freq,,,,2,1,,,,,,,


#### Keywords

In [17]:
categorical_df['plot_keywords'].describe()

count                                                  4776
unique                                                 4649
top       animal name in title|ape abducts a woman|goril...
freq                                                      3
Name: plot_keywords, dtype: object

In [18]:
categorical_df[categorical_df['plot_keywords'].isna()].describe() #Estratégia: deixar sem

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,113,113,113,115,115,113,113,113,113,0.0,112,67
unique,112,112,112,17,113,61,11,112,112,0.0,2,7
top,Demi Moore,Landon Liboiron,Michael Wincott,USA,Joel Paul Reisig,Drama,English,Forsaken,http://www.imdb.com/title/tt2271563/?ref_=fn_t...,,Color,R
freq,2,2,2,72,2,16,95,2,2,,109,29


#### Idioma

In [19]:
categorical_df['language'].value_counts().head()

English     4571
French        70
Spanish       40
Hindi         28
Mandarin      26
Name: language, dtype: int64

In [20]:
categorical_df.loc[categorical_df['language'].isna()]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
3086,Jon Gries,Taylor Handley,Trent Ford,USA,Christopher Cain,Drama|History|Romance|Western,,September Dawn,http://www.imdb.com/title/tt0473700/?ref_=fn_t...,massacre|mormon|settler|utah|wagon train,Color,R
3539,Debi Derryberry,Kate Higgins,Cindy Robinson,USA,Richard Rich,Action|Adventure|Animation|Comedy|Drama|Family...,,Alpha and Omega 4: The Legend of the Saw Tooth...,http://www.imdb.com/title/tt4061848/?ref_=fn_t...,blindness|cave|spirit|wolf|wolf cub,,
3869,Sid Caesar,Dom DeLuise,Bernadette Peters,USA,Mel Brooks,Comedy|Romance,,Silent Movie,http://www.imdb.com/title/tt0075222/?ref_=fn_t...,black comedy|friend|modern silent movie|silent...,Color,PG
4110,William Morgan Sheppard,Kevin Gage,Brianna Brown,USA,Michael Landon Jr.,Drama|Family|Western,,Love's Abiding Joy,http://www.imdb.com/title/tt0785025/?ref_=fn_t...,19th century|faith|mayor|ranch|sheriff,Color,PG
4810,Lillian Gish,Mae Marsh,Walter Long,USA,D.W. Griffith,Drama|History|War,,Intolerance: Love's Struggle Throughout the Ages,http://www.imdb.com/title/tt0006864/?ref_=fn_t...,huguenot|intolerance|medicis|protestant|wedding,Black and White,Not Rated
4885,John Gilbert,Renée Adorée,Claire Adams,USA,King Vidor,Drama|Romance|War,,The Big Parade,http://www.imdb.com/title/tt0015624/?ref_=fn_t...,chewing gum|climbing a tree|france|translation...,Black and White,Not Rated
4958,Stephen Carr,Johnnie Walker,Mary Carr,USA,Harry F. Millarde,Crime|Drama,,Over the Hill to the Poorhouse,http://www.imdb.com/title/tt0011549/?ref_=fn_t...,family relationships|gang|idler|poorhouse|thief,Black and White,
2370,,,,USA,Ben Affleck,,,,,,,
3397,,,,USA,Sam Catlin,,,,,,,


In [21]:
#Todos os que faltam são americanos, logo são em inglês
categorical_df.loc[categorical_df['language'].isna(), 'language'] = 'English'

#### Classificação Indicativa

In [22]:
categorical_df.loc[categorical_df['content_rating'].isna()].describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating
count,227,227,226,229,229,227,229,227,227,181,225,0.0
unique,220,223,223,28,223,111,22,225,225,179,2,0.0
top,Konstantin Khabenskiy,Shin'ya Tsukamoto,Riteish Deshmukh,USA,Jon Knautz,Drama,English,Godzilla Resurgence,http://www.imdb.com/title/tt0796314/?ref_=fn_t...,blood|godzilla|monster|sequel,Color,
freq,2,2,2,115,2,19,156,2,2,2,211,


In [23]:
# Estratégia: Deixar como está

#### Cor

In [24]:
# Visualização dos dados antes de tratar dados ausentes
categorical_df['color'].astype(str).value_counts()

Color              4672
Black and White     205
nan                  14
Name: color, dtype: int64

In [25]:
#Estratégia: remover as que faltam, dado que são poucas
categorical_df = categorical_df[~categorical_df['color'].isna()]

### 3.2 Codificando colunas simples

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
import string

def remove_special_chars(x, exceptions = ['|']):
    x = x.replace(' ', '').lower()
    for char in string.punctuation:
        if char not in exceptions:
            x = x.replace(char, '')
    return x
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
#Tratando dados ausentes
is_color = numerical_df['movie_facebook_likes'] > 0

# Não tem valor e é colorido
color_df = categorical_df[is_color & categorical_df['color'].isna()]
color_df['color'] = 'Color'
categorical_df.update(color_df)

# Não tem valor e é preto e branco
color_df = categorical_df[~is_color & categorical_df['color'].isna()]
color_df['color'] = 'Black and White'

# Atualizando as entradas do dataframe
categorical_df.update(color_df)
categorical_df['color'].describe()

print("Valores presentes: {}".format(set(categorical_df['color'].values)))

Valores presentes: {'Color', 'Black and White'}


  """
  # Remove the CWD from sys.path while we load stuff.


#### Atores

In [28]:
columns_actors = ['actor_1_name', 'actor_2_name', 'actor_3_name']
categorical_df[columns_actors] = categorical_df[columns_actors].astype(str).applymap(remove_special_chars)
actors = categorical_df[columns_actors].values.flatten()

In [29]:
actor_vectorizer = CountVectorizer()
actor_vectorizer.fit(actors)
vocabulary = sorted(actor_vectorizer.vocabulary_)

In [30]:
join_actors = categorical_df[columns_actors[0]].astype(str) +'|' +  categorical_df[columns_actors[1]].astype(str) + '|' +categorical_df[columns_actors[2]].astype(str)
categorical_df.drop(columns = columns_actors, inplace=True)
join_actors.head()

0            cchpounder|joeldavidmoore|wesstudi
1         johnnydepp|orlandobloom|jackdavenport
2    christophwaltz|rorykinnear|stephaniesigman
3     tomhardy|christianbale|josephgordonlevitt
5        darylsabara|samanthamorton|pollywalker
dtype: object

In [31]:
#actors_vect =  actor_vectorizer.transform(categorical_df['actors']).toarray()
#categorical_df['actors'] = actors_vect
vect_actors = actor_vectorizer.transform(join_actors.values)
actors_df =pd.DataFrame(vect_actors.toarray())
actors_df.rename(lambda x: 'actor_'+vocabulary[int(x)], axis='columns', inplace=True)
categorical_df = categorical_df.join(actors_df)
categorical_df.head()

Unnamed: 0,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating,actor_50cent,...,actor_zooeydeschanel,actor_zoëbell,actor_zoëkravitz,actor_zoëpoledouris,actor_zubaidasahar,actor_zuhairhaddad,actor_álexangulo,actor_ángelamolina,actor_émiliedequenne,actor_óscarjaenada
0,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,PG-13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,PG-13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,PG-13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,PG-13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,PG-13,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### País, diretor, idioma, Título, Cor, Classificação Indicativa, Título e Link

In [32]:
categorical_df.drop(columns=['movie_title', 'movie_imdb_link'], inplace=True) # não importam
variables = ['country', 'director_name', 'language', 'color', 'content_rating']
dummies = pd.get_dummies(categorical_df[variables])
categorical_df.drop(columns=variables, inplace=True)
categorical_df = categorical_df.join(dummies)
categorical_df.head()

Unnamed: 0,genres,plot_keywords,actor_50cent,actor_aaliyah,actor_aaronashmore,actor_aaronhughes,actor_aaronkwok,actor_aaronstanford,actor_aaronstaton,actor_aaronyoo,...,content_rating_Not Rated,content_rating_PG,content_rating_PG-13,content_rating_Passed,content_rating_R,content_rating_TV-14,content_rating_TV-G,content_rating_TV-PG,content_rating_Unrated,content_rating_X
0,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
2,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
3,Action|Thriller,deception|imprisonment|lawlessness|police offi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
5,Action|Adventure|Sci-Fi,alien|american civil war|male nipple|mars|prin...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0


### 3.3 Codificando colunas em formato de lista (gênero e keywords)

#### Gêneros

In [33]:
#Processando gênero
# Transformando string em lista
genres =  categorical_df.genres.apply(remove_special_chars)
genres.head()

0    action|adventure|fantasy|scifi
1          action|adventure|fantasy
2         action|adventure|thriller
3                   action|thriller
5            action|adventure|scifi
Name: genres, dtype: object

In [34]:
vocabulary = set()
genres.apply(lambda x: vocabulary.update(x.split('|')))

genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(vocabulary)
vocabulary = sorted(genre_vectorizer.vocabulary_)
vocabulary

['action',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'filmnoir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'news',
 'romance',
 'scifi',
 'short',
 'sport',
 'thriller',
 'war',
 'western']

In [35]:
vect_genres = genre_vectorizer.transform(genres.values)
genres_binary_df = pd.DataFrame(vect_genres.toarray())
genres_binary_df.rename(lambda x: 'genre_'+vocabulary[int(x)], axis='columns', inplace=True)
genres_binary_df.head()

Unnamed: 0,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,genre_fantasy,...,genre_musical,genre_mystery,genre_news,genre_romance,genre_scifi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [36]:
# Colocando de volta no Dataframe
categorical_df.drop('genres', axis='columns', inplace=True)
categorical_df = categorical_df.join(genres_binary_df)
categorical_df.head()

Unnamed: 0,plot_keywords,actor_50cent,actor_aaliyah,actor_aaronashmore,actor_aaronhughes,actor_aaronkwok,actor_aaronstanford,actor_aaronstaton,actor_aaronyoo,actor_aasheekaabathija,...,genre_musical,genre_mystery,genre_news,genre_romance,genre_scifi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,avatar|future|marine|native|paraplegic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,goddess|marriage ceremony|marriage proposal|pi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bomb|espionage|sequel|spy|terrorist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,deception|imprisonment|lawlessness|police offi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,alien|american civil war|male nipple|mars|prin...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Keywords

In [37]:
#Lembrete do problema de '1000000 b.c.
keywords = categorical_df.plot_keywords.astype(str).apply(lambda x: x.split('|'))
all_keywords = set()
keywords.apply(lambda x: all_keywords.update(x))
sorted(all_keywords)

['10 year old',
 '1000000 b.c.',
 '1190s',
 '12 step program',
 '12 year old',
 '12 year time span',
 '12th century',
 '13 year old',
 '13 year olds',
 '13th birthday',
 '14 year old',
 '14th century',
 '15 year old',
 '1520s',
 '15th birthday',
 '15th century',
 '16 year old',
 '16th century',
 '1770s',
 '17th century',
 '18 wheeler',
 '1800s',
 '1810s',
 '1830s',
 '1850s',
 '1860s',
 '1880s',
 '1890s',
 '18th birthday',
 '18th century',
 '1910s',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1955 chevrolet',
 '1959 cadillac',
 '1960s',
 '1969 dodge charger',
 '1970s',
 '1980s',
 '1988 winter olympics',
 '1990s',
 '19th century',
 '1st century',
 '1st century b.c.',
 '20 years later',
 '2000s',
 '2010s',
 '2030s',
 '20th century',
 '21 year old',
 '21st birthday',
 '21st century',
 '22 year old',
 '22nd century',
 '23 year time span',
 '23rd century',
 '27th century',
 '2nd century',
 '3 dimensional',
 '30 year old',
 '35 mm digital camera',
 '3d',
 '3d in title',
 '40 year old',
 '40th b

In [38]:
# Processando keywords
keywords = categorical_df.plot_keywords.astype(str).apply(remove_special_chars)
all_keywords = set()
keywords.apply(lambda x: all_keywords.update(x.split('|')))
len(all_keywords)

7933

In [39]:
keyword_vectorizer = CountVectorizer()
keyword_vectorizer.fit(all_keywords)
vocabulary = sorted(keyword_vectorizer.vocabulary_)
len(vocabulary)

7933

In [40]:
vect_keywords = genre_vectorizer.transform(keywords.values)
keywords_binary_df = pd.DataFrame(vect_keywords.toarray())
keywords_binary_df.rename(lambda x: 'keyword_'+vocabulary[int(x)], axis='columns', inplace=True)
keywords_binary_df.head()

Unnamed: 0,keyword_1000000bc,keyword_10yearold,keyword_1190s,keyword_12stepprogram,keyword_12thcentury,keyword_12yearold,keyword_12yeartimespan,keyword_13thbirthday,keyword_13yearold,keyword_13yearolds,...,keyword_15thcentury,keyword_15yearold,keyword_16thcentury,keyword_16yearold,keyword_1770s,keyword_17thcentury,keyword_1800s,keyword_1810s,keyword_1830s,keyword_1850s
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Colocando de volta no Dataframe
categorical_df.drop('plot_keywords', axis='columns', inplace=True)
categorical_df = categorical_df.join(keywords_binary_df)
categorical_df.head()

Unnamed: 0,actor_50cent,actor_aaliyah,actor_aaronashmore,actor_aaronhughes,actor_aaronkwok,actor_aaronstanford,actor_aaronstaton,actor_aaronyoo,actor_aasheekaabathija,actor_aasifmandvi,...,keyword_15thcentury,keyword_15yearold,keyword_16thcentury,keyword_16yearold,keyword_1770s,keyword_17thcentury,keyword_1800s,keyword_1810s,keyword_1830s,keyword_1850s
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Processando variáveis numéricas

In [42]:
numerical_df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,,855.0,1000.0,760505847.0,886204,4834.0,,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000.0
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350.0,,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,
2,602.0,148.0,,161.0,11000.0,200074175.0,275868,11700.0,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000.0
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759.0,,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000.0
5,462.0,132.0,475.0,530.0,640.0,73058679.0,212204,1873.0,1.0,738.0,263700000.0,2012.0,632.0,6.6,2.35,24000.0


### 4.1 Valores Ausentes

In [43]:
#### Visão geral
pd.options.mode.chained_assignment = None ## remover dps

In [44]:
numerical_df.isnull().sum()

num_critic_for_reviews         29
duration                        7
director_facebook_likes       935
actor_3_facebook_likes         74
actor_1_facebook_likes          7
gross                         804
num_voted_users                 0
cast_total_facebook_likes       7
facenumber_in_poster         2109
num_user_for_reviews            7
budget                        435
title_year                     62
actor_2_facebook_likes         33
imdb_score                      0
aspect_ratio                  275
movie_facebook_likes         2148
dtype: int64

#### Duração

In [45]:
# Subs. pela media
numerical_df['duration'] = numerical_df['duration'].replace(np.nan, numerical_df.duration.mean())

In [46]:
# Temporário: ajustar tudo pela média:
for col in numerical_df.columns:
    numerical_df[col] = numerical_df[col].replace(np.nan, numerical_df[col].mean())

In [47]:
numerical_df.isnull().sum()

num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_1_facebook_likes       0
gross                        0
num_voted_users              0
cast_total_facebook_likes    0
facenumber_in_poster         0
num_user_for_reviews         0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

### 4.2 Normalizando os dados

In [48]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_vals = numerical_df.values
numerical_vals = scaler.fit_transform(numerical_vals)
numerical_df = pd.DataFrame(numerical_vals, columns=numerical_df.columns, index=numerical_df.index)
numerical_df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,0.889163,0.522936,0.036614,0.03709,0.001559,1.0,0.524453,0.007358,0.03326,0.603479,0.019402,0.93,0.006818,0.818182,0.040486,0.094548
1,0.37069,0.495413,0.024393,0.043395,0.062497,0.40684,0.278865,0.07362,0.03326,0.244515,0.024559,0.91,0.036482,0.714286,0.078947,0.038674
2,0.740148,0.431193,0.036614,0.006914,0.017184,0.26308,0.163256,0.017813,0.0,0.196284,0.020056,0.99,0.002854,0.675325,0.078947,0.243547
3,1.0,0.480122,0.956518,1.0,0.042185,0.589253,0.677216,0.162559,0.03326,0.533702,0.020466,0.96,0.167871,0.896104,0.078947,0.469909
5,0.567734,0.382263,0.020567,0.022959,0.000997,0.096066,0.125579,0.002849,0.0,0.145681,0.021587,0.96,0.004599,0.649351,0.078947,0.06876


## 5.0 Agrupando resultados e exportando

In [49]:
result = numerical_df.join(categorical_df)
result.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,keyword_15thcentury,keyword_15yearold,keyword_16thcentury,keyword_16yearold,keyword_1770s,keyword_17thcentury,keyword_1800s,keyword_1810s,keyword_1830s,keyword_1850s
0,0.889163,0.522936,0.036614,0.03709,0.001559,1.0,0.524453,0.007358,0.03326,0.603479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.37069,0.495413,0.024393,0.043395,0.062497,0.40684,0.278865,0.07362,0.03326,0.244515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.740148,0.431193,0.036614,0.006914,0.017184,0.26308,0.163256,0.017813,0.0,0.196284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.480122,0.956518,1.0,0.042185,0.589253,0.677216,0.162559,0.03326,0.533702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.567734,0.382263,0.020567,0.022959,0.000997,0.096066,0.125579,0.002849,0.0,0.145681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
len(result)

4951

In [51]:
result.to_csv('../models/data.csv')