In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
# Importando dados
movies_df = pd.read_csv('../filmes.csv')

In [3]:
# Visualizando colunas do dataset
variables = list(movies_df.columns.values)
variables_str = '\n'.join(sorted(variables))
print('Variáveis:\n{}'.format(variables_str))
movies_df.head()

Variáveis:
actor_1_facebook_likes
actor_1_name
actor_2_facebook_likes
actor_2_name
actor_3_facebook_likes
actor_3_name
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
duration
facenumber_in_poster
genres
gross
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
title_year


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## 1. Deletando entradas com 1/3 dos valores ausentes

In [4]:
from math import ceil

num_variables = len(variables)
threshold = ceil((1/3)*num_variables)
print("Número de variáveis: {}. Limite de {} valores ausentes".format(num_variables, threshold))

# 1. Substituir 0's por NaN's
movies_df = movies_df.applymap(lambda x: np.nan if x == 0.0 else x)
size = len(movies_df)
# 2. Contabilizar NaNs para cada entrada
count_missing = movies_df.isnull().sum(axis='columns')
# 3. Remover
movies_df = movies_df[count_missing < threshold]
print("{} de {} entradas removidas".format(size-len(movies_df), size))

Número de variáveis: 28. Limite de 10 valores ausentes
16 de 5043 entradas removidas


## 2.0 Separando variáveis categóricas das variáveis numéricas

In [5]:
categorical_variables = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name', 'genres', 'language', 'movie_title', 'movie_imdb_link', 'plot_keywords', 'color']
numerical_variables = [v for v in variables if v not in categorical_variables]

categorical_df = movies_df[categorical_variables]
numerical_df = movies_df[numerical_variables]

## 3. Processando variáveis categóricas

In [6]:
# Descrição do dataset categórico puro
categorical_df.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
count,5025,5023,5016,5024,4928,5027,5018,5027,5027,4881,5011
unique,2086,3026,3518,64,2387,911,47,4901,4903,4751,2
top,Robert De Niro,Morgan Freeman,John Heard,USA,Steven Spielberg,Drama,English,King Kong,http://www.imdb.com/title/tt1976009/?ref_=fn_t...,based on novel,Color
freq,49,20,8,3796,26,236,4693,3,3,4,4802


In [7]:
# Amostra do dataset
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color


### 3.0 Uniformizando o texto

In [8]:
# Tirando todos os espaços em branco antes e depois das palavras
categorical_df = categorical_df.applymap(lambda x: x.strip() if type(x) is str else x)

### 3.1 Tratando valores ausentes
Estratégias:
1. Nomes dos atores: Deixar ausente
2. País: 
3. Diretor
4. Idioma:
5. Título: deixar ausente
6. Keywords: deixar ausente
8. Link do imdb: deixar ausente
9. Cor: colorido se número de likes no fb != nan

In [9]:
# Atores
missing = categorical_df[categorical_df['actor_1_name'].isna() & categorical_df['actor_2_name'].isna() | categorical_df['actor_3_name'].isna()]
missing.describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color
count,9,7,0.0,11,11,11,11,11,11,9,11
unique,9,7,0.0,7,11,7,2,11,11,9,1
top,Jacques Perrin,Bobby Kendall,,USA,Jacques Perrin,Documentary,English,"Pink Ribbons, Inc.",http://www.imdb.com/title/tt2017038/?ref_=fn_t...,climate|earth|global warming|science|truth,Color
freq,1,1,,5,1,5,10,1,1,1,11


In [10]:
#Cor
is_color = numerical_df['movie_facebook_likes'] > 0

# Não tem valor e é colorido
color_df = categorical_df[is_color & categorical_df['color'].isna()]
color_df['color'] = 'Color'
categorical_df.update(color_df)

# Não tem valor e é preto e branco
color_df = categorical_df[~is_color & categorical_df['color'].isna()]
color_df['color'] = 'Black and White'
categorical_df.update(color_df)
categorical_df['color'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


count      5027
unique        2
top       Color
freq       4812
Name: color, dtype: object

### 3.2 Processando colunas em formato de lista (gênero e keywords)

In [11]:
#Processando gênero
# Transformando string em lista
genres =  categorical_df.genres.apply(lambda x: x.split('|'))
genres.head()

0    [Action, Adventure, Fantasy, Sci-Fi]
1            [Action, Adventure, Fantasy]
2           [Action, Adventure, Thriller]
3                      [Action, Thriller]
5             [Action, Adventure, Sci-Fi]
Name: genres, dtype: object

In [12]:
genres = genres.apply(pd.Series)
genres.rename(lambda x: 'genre_{}'.format(x), axis='columns', inplace=True)
genres.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,Action,Adventure,Fantasy,Sci-Fi,,,,
1,Action,Adventure,Fantasy,,,,,
2,Action,Adventure,Thriller,,,,,
3,Action,Thriller,,,,,,
5,Action,Adventure,Sci-Fi,,,,,


In [13]:
# Convertendo para binário
genres = pd.get_dummies(genres)
genres_labels = genres.columns
genres.head()

Unnamed: 0,genre_0_Action,genre_0_Adventure,genre_0_Animation,genre_0_Biography,genre_0_Comedy,genre_0_Crime,genre_0_Documentary,genre_0_Drama,genre_0_Family,genre_0_Fantasy,...,genre_6_Fantasy,genre_6_Musical,genre_6_Mystery,genre_6_Romance,genre_6_Sci-Fi,genre_6_Sport,genre_6_Thriller,genre_6_War,genre_7_Romance,genre_7_Thriller
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Removendo a lista de gêneros e substituindo pelos dummies
categorical_df.drop('genres', axis='columns', inplace=True)
categorical_df = pd.concat([categorical_df, genres])
categorical_df.loc[:, genres_labels] = categorical_df.loc[:, genres_labels].replace(np.nan, 0)
categorical_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,color,country,director_name,genre_0_Action,genre_0_Adventure,genre_0_Animation,genre_0_Biography,...,genre_6_Sci-Fi,genre_6_Sport,genre_6_Thriller,genre_6_War,genre_7_Romance,genre_7_Thriller,language,movie_imdb_link,movie_title,plot_keywords
0,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,James Cameron,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,English,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,Avatar,avatar|future|marine|native|paraplegic
1,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,Gore Verbinski,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,English,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,Sam Mendes,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,English,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,Spectre,bomb|espionage|sequel|spy|terrorist
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,Christopher Nolan,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,English,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...
5,Daryl Sabara,Samantha Morton,Polly Walker,Color,USA,Andrew Stanton,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,English,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,John Carter,alien|american civil war|male nipple|mars|prin...
