In [66]:
import pandas as pd
import numpy as np

In [67]:
# Importando dados
movies_df = pd.read_csv('../filmes.csv')
# Visão geral das colunas e dados ausentes
movies_df.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

## 1. Deletando entradas com 25% dos valores ausentes

In [68]:
num_variables = len(movies_df.columns)
threshold = 0.25*num_variables
print("Número de variáveis: {}. Limite de {} valores ausentes".format(num_variables, threshold))

# 1. Substituir 0's por NaN's
movies_df = movies_df.applymap(lambda x: np.nan if x == 0.0 else x)
size = len(movies_df)
# 2. Contabilizar valores ausentes por linha
count_missing = movies_df.isnull().sum(axis='columns')
# 3. Remover
movies_df = movies_df[count_missing < threshold]
print("{} de {} entradas removidas".format(size-len(movies_df), size))

Número de variáveis: 28. Limite de 7.0 valores ausentes
92 de 5043 entradas removidas


In [69]:
# Tirando todos os espaços em branco antes e depois das palavras
movies_df = movies_df.applymap(lambda x: x.strip() if type(x) is str else x)

## 2.0 Separando variáveis categóricas das variáveis numéricas

In [70]:
categorical_variables = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'country', 'director_name', 'genres', 'language', 'movie_title', 'movie_imdb_link', 'plot_keywords', 'color', 'content_rating', 'title_year', 'aspect_ratio']
numerical_variables = [v for v in movies_df.columns if v not in categorical_variables]

## 3. Processando variáveis categóricas

In [71]:
# Descrição do dataset categórico puro
movies_df[categorical_variables].astype(str).describe()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating,title_year,aspect_ratio
count,4951,4951,4951,4951,4951.0,4951,4951,4951,4951,4951.0,4951,4951,4951.0,4951.0
unique,2039,2969,3466,62,2358.0,902,46,4824,4827,4710.0,3,19,92.0,23.0
top,Robert De Niro,Morgan Freeman,Ben Mendelsohn,USA,,Drama,English,Home,http://www.imdb.com/title/tt2638144/?ref_=fn_t...,,Color,R,2009.0,2.35
freq,49,20,8,3757,62.0,231,4630,3,3,113.0,4731,2117,259.0,2358.0


In [72]:
# Amostra do dataset
movies_df[categorical_variables].head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,country,director_name,genres,language,movie_title,movie_imdb_link,plot_keywords,color,content_rating,title_year,aspect_ratio
0,CCH Pounder,Joel David Moore,Wes Studi,USA,James Cameron,Action|Adventure|Fantasy|Sci-Fi,English,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,avatar|future|marine|native|paraplegic,Color,PG-13,2009.0,1.78
1,Johnny Depp,Orlando Bloom,Jack Davenport,USA,Gore Verbinski,Action|Adventure|Fantasy,English,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,goddess|marriage ceremony|marriage proposal|pi...,Color,PG-13,2007.0,2.35
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,UK,Sam Mendes,Action|Adventure|Thriller,English,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,bomb|espionage|sequel|spy|terrorist,Color,PG-13,2015.0,2.35
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,USA,Christopher Nolan,Action|Thriller,English,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,deception|imprisonment|lawlessness|police offi...,Color,PG-13,2012.0,2.35
5,Daryl Sabara,Samantha Morton,Polly Walker,USA,Andrew Stanton,Action|Adventure|Sci-Fi,English,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,alien|american civil war|male nipple|mars|prin...,Color,PG-13,2012.0,2.35


In [73]:
movies_df[categorical_variables].isnull().sum()

actor_1_name         0
actor_2_name         0
actor_3_name         6
country              1
director_name       62
genres               0
language             7
movie_title          0
movie_imdb_link      0
plot_keywords      113
color               12
content_rating     236
title_year          62
aspect_ratio       275
dtype: int64

### 3.1 Tratando valores ausentes

#### Atores

In [74]:
missing = movies_df[movies_df['actor_1_name'].isna() & movies_df['actor_2_name'].isna() | movies_df['actor_3_name'].isna()]
missing

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
1782,Color,Jacques Perrin,100.0,81.0,63.0,,Philippe Labro,63.0,10762178.0,Documentary,...,153.0,English,France,G,160000000.0,2001.0,3.0,8.0,1.85,1000.0
3528,Color,Gus Van Sant,103.0,103.0,835.0,,Casey Affleck,13000.0,236266.0,Adventure|Drama|Mystery,...,290.0,English,USA,R,3500000.0,2002.0,,6.2,2.35,
3610,Color,Jean-Jacques Mantello,9.0,42.0,,,Daryl Hannah,844.0,7518876.0,Adventure|Documentary|Short,...,5.0,English,UK,,6000000.0,2008.0,,6.5,1.78,28.0
4225,Color,James Algar,99.0,120.0,11.0,,Deems Taylor,16.0,76400000.0,Animation|Family|Fantasy|Music,...,230.0,English,USA,G,2280000.0,1940.0,,7.8,1.37,3000.0
4548,Color,Davis Guggenheim,372.0,96.0,49.0,,Al Gore,861.0,23808111.0,Documentary,...,504.0,English,USA,PG,,2006.0,68.0,7.5,1.85,
5009,Color,James Bidgood,8.0,65.0,,,Bobby Kendall,,8231.0,Drama|Fantasy,...,16.0,English,USA,Not Rated,27000.0,1971.0,,6.7,1.37,85.0


In [75]:
## Os filmes que faltam o 3o autor são antigos e em pouca quantidade. Deletar.
movies_df = movies_df[~movies_df['actor_3_name'].isna()]
movies_df[categorical_variables].isnull().sum()

actor_1_name         0
actor_2_name         0
actor_3_name         0
country              1
director_name       62
genres               0
language             7
movie_title          0
movie_imdb_link      0
plot_keywords      113
color               12
content_rating     235
title_year          62
aspect_ratio       275
dtype: int64

#### Title year and aspect ratio

In [76]:
movies_df.title_year.value_counts().head()

2009.0    259
2014.0    244
2006.0    237
2013.0    231
2010.0    229
Name: title_year, dtype: int64

In [77]:
movies_df[movies_df['title_year'].isna()][categorical_variables].isnull().sum()
# Todos os que faltam ano faltam nome do diretor, duas coisas importantes, eliminando entradas

actor_1_name        0
actor_2_name        0
actor_3_name        0
country             0
director_name      62
genres              0
language            0
movie_title         0
movie_imdb_link     0
plot_keywords       0
color               0
content_rating      9
title_year         62
aspect_ratio        1
dtype: int64

In [78]:
movies_df = movies_df[~movies_df['title_year'].isna()]

In [79]:
movies_df.aspect_ratio.value_counts().head()
# tem 275 entradas com um aspect ration faltando, se atribuir um valor padrão vai se tornar uma das categorias mais
#frequentes

2.35    2355
1.85    1900
1.37      97
1.78      90
1.66      63
Name: aspect_ratio, dtype: int64

In [80]:
movies_df[movies_df['aspect_ratio'].isna()][categorical_variables].isnull().sum()
# pela tabela abaixo eliminando todos os q faltam aspect ratio, elimina boa parte dos que faltam keywords e rating

actor_1_name         0
actor_2_name         0
actor_3_name         0
country              0
director_name        0
genres               0
language             2
movie_title          0
movie_imdb_link      0
plot_keywords       53
color                6
content_rating      90
title_year           0
aspect_ratio       274
dtype: int64

In [81]:
movies_df = movies_df[~movies_df['aspect_ratio'].isna()]

In [82]:
movies_df[categorical_variables].isnull().sum() 
# Ao eliminar as entradas com aspect ratio e title year ausentes, eliminaos as entradas com diretor ausente

actor_1_name         0
actor_2_name         0
actor_3_name         0
country              1
director_name        0
genres               0
language             5
movie_title          0
movie_imdb_link      0
plot_keywords       60
color                6
content_rating     136
title_year           0
aspect_ratio         0
dtype: int64

#### País

In [83]:
country_df = movies_df.country;
country_df.describe()

count     4608
unique      57
top        USA
freq      3508
Name: country, dtype: object

In [84]:
print("Tabela de Frequências")
country_df.value_counts().head()

Tabela de Frequências


USA        3508
UK          416
France      143
Canada      106
Germany      95
Name: country, dtype: int64

In [85]:
## Valores Ausentes
movies_df[movies_df['country'].isna()]

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
4021,Color,Daniel Petrie Jr.,9.0,88.0,19.0,322.0,Jeff Fahey,795.0,,Drama|Thriller,...,13.0,English,,,3500000.0,2014.0,535.0,4.8,2.35,570.0


In [86]:
# Ajustando todos valores ausentes manualmente, baseando-se no país
#categorical_df.loc[2370, 'country'] = "USA"
#categorical_df.loc[2370, 'director_name'] = "Ben Affleck"

#categorical_df.loc[3397, 'country'] = "USA"
#categorical_df.loc[3397, 'director_name'] = "Sam Catlin"

movies_df.loc[4021, 'country'] = "USA"

#### Diretor - sem valores ausentes

#### Gêneros - sem valores ausentes

#### Keywords

In [87]:
movies_df['plot_keywords'].astype(str).describe()

count     4609
unique    4429
top        nan
freq        60
Name: plot_keywords, dtype: object

In [88]:
movies_df[movies_df['plot_keywords'].isna()].describe() #Estratégia: categoria padrao

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,58.0,60.0,46.0,56.0,60.0,22.0,60.0,60.0,37.0,60.0,52.0,60.0,57.0,60.0,60.0,46.0
mean,21.086207,97.433333,92.108696,217.285714,1332.983333,1189567.0,1389.466667,2231.15,2.027027,13.366667,5858885.0,2012.3,312.54386,5.91,2.578833,1580.152174
std,25.144693,14.340138,203.624298,230.003535,4654.548872,3394531.0,2438.509947,5135.931635,1.536346,19.243283,10808330.0,3.455774,298.168461,1.294408,2.53177,5232.791687
min,1.0,59.0,3.0,2.0,10.0,3330.0,6.0,12.0,1.0,1.0,10000.0,2002.0,5.0,2.6,1.33,4.0
25%,5.25,89.75,8.0,33.75,160.5,24191.5,127.5,250.75,1.0,4.0,375000.0,2011.0,63.0,5.2,1.85,66.5
50%,12.5,97.0,24.0,132.0,432.0,85196.0,395.5,1049.5,2.0,7.5,1750000.0,2013.0,213.0,6.15,2.35,193.5
75%,22.75,103.25,61.0,319.25,898.75,218072.8,1649.75,2429.0,2.0,13.5,8000000.0,2015.0,445.0,6.95,2.35,396.5
max,131.0,141.0,1000.0,975.0,35000.0,14946230.0,15978.0,38121.0,7.0,122.0,65000000.0,2016.0,1000.0,8.1,16.0,26000.0


In [89]:
#remover
movies_df = movies_df[~movies_df['plot_keywords'].isna()]

#### Idioma

In [90]:
movies_df['language'].value_counts().head()

English     4277
French        63
Spanish       33
Mandarin      25
German        18
Name: language, dtype: int64

In [91]:
movies_df.loc[movies_df['language'].isna()]

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
3086,Color,Christopher Cain,43.0,111.0,58.0,258.0,Taylor Handley,482.0,1066555.0,Drama|History|Romance|Western,...,111.0,,USA,R,11000000.0,2007.0,362.0,5.8,1.85,411.0
3869,Color,Mel Brooks,39.0,87.0,,753.0,Dom DeLuise,898.0,,Comedy|Romance,...,61.0,,USA,PG,4400000.0,1976.0,842.0,6.7,1.85,629.0
4810,Black and White,D.W. Griffith,69.0,123.0,204.0,9.0,Mae Marsh,436.0,,Drama|History|War,...,88.0,,USA,Not Rated,385907.0,1916.0,22.0,8.0,1.33,691.0
4885,Black and White,King Vidor,48.0,151.0,54.0,6.0,Renée Adorée,81.0,,Drama|Romance|War,...,45.0,,USA,Not Rated,245000.0,1925.0,12.0,8.3,1.33,226.0
4958,Black and White,Harry F. Millarde,1.0,110.0,,,Johnnie Walker,2.0,3000000.0,Crime|Drama,...,1.0,,USA,,100000.0,1920.0,2.0,4.8,1.33,


In [92]:
#Todos os que faltam são americanos, logo são em inglês
movies_df.loc[movies_df['language'].isna(), 'language'] = 'English'

#### Classificação Indicativa

In [93]:
movies_df.content_rating.value_counts()

R            2013
PG-13        1411
PG            665
G             105
Not Rated      83
Approved       55
Unrated        53
X              13
Passed          9
NC-17           7
M               5
GP              5
TV-14           3
TV-G            3
TV-PG           1
Name: content_rating, dtype: int64

In [94]:
movies_df[movies_df['content_rating'].isna()]

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
98,Color,Hideaki Anno,1.0,120.0,28.0,12.0,Shin'ya Tsukamoto,544.0,,Action|Adventure|Drama|Horror|Sci-Fi,...,13.0,Japanese,Japan,,,2016.0,106.0,8.2,2.35,
204,Color,Hideaki Anno,1.0,120.0,28.0,12.0,Shin'ya Tsukamoto,544.0,,Action|Adventure|Drama|Horror|Sci-Fi,...,13.0,Japanese,Japan,,,2016.0,106.0,8.2,2.35,
242,Color,Frédéric Forestier,33.0,116.0,,141.0,Santiago Segura,936.0,,Adventure|Comedy|Family|Fantasy,...,36.0,French,France,,78000000.0,2008.0,276.0,5.1,2.35,291.0
489,Color,Lucile Hadzihalilovic,63.0,81.0,92.0,8.0,Roxane Duran,23.0,,Drama|Horror|Mystery|Sci-Fi,...,3.0,French,France,,,2015.0,21.0,6.4,2.35,257.0
810,Color,Pou-Soi Cheang,14.0,119.0,3.0,22.0,Aaron Kwok,879.0,,Action|Adventure|Fantasy,...,9.0,English,China,,68005000.0,2016.0,107.0,6.0,2.35,426.0
1044,Color,Chuck Bowman,32.0,89.0,42.0,235.0,Lochlyn Munro,598.0,,Horror|Thriller,...,32.0,English,USA,,1500000.0,2006.0,555.0,4.6,1.78,352.0
1329,Color,S.S. Rajamouli,44.0,159.0,50.0,72.0,Anushka Shetty,218.0,6498000.0,Action|Adventure|Drama|Fantasy|War,...,410.0,Telugu,India,,18026148.0,2015.0,133.0,8.4,1.85,21000.0
1724,Color,Laurent Tirard,56.0,91.0,2.0,55.0,Sandrine Kiberlain,295.0,,Comedy|Family,...,17.0,French,France,,,2009.0,71.0,7.2,1.85,
1730,Color,Olivier Dahan,132.0,103.0,61.0,520.0,Frank Langella,963.0,,Biography|Drama|Romance,...,62.0,English,Switzerland,,30000000.0,2014.0,903.0,5.7,2.35,6000.0
1793,Color,Agustín Díaz Yanes,31.0,145.0,13.0,278.0,Elena Anaya,10000.0,,Adventure|Drama|History|Romance|Thriller|War,...,84.0,Spanish,Spain,,24000000.0,2006.0,1000.0,6.1,1.85,


In [95]:
# substituir pela moda
movies_df.content_rating.fillna(movies_df.content_rating.mode(), inplace=True)

In [96]:
# Tabela de frequência acumulada
movies_df.content_rating.value_counts().cumsum() / len(movies_df.content_rating) * 100

R            44.251484
PG-13        75.269290
PG           89.887887
G            92.196087
Not Rated    94.020664
Approved     95.229721
Unrated      96.394812
X            96.680589
Passed       96.878435
NC-17        97.032315
M            97.142229
GP           97.252143
TV-14        97.318092
TV-G         97.384040
TV-PG        97.406023
Name: content_rating, dtype: float64

#### Cor

In [97]:
# Visualização dos dados antes de tratar dados ausentes
movies_df['color'].astype(str).value_counts()

Color              4346
Black and White     198
nan                   5
Name: color, dtype: int64

In [98]:
#Estratégia: remover as que faltam, dado que são poucas
movies_df = movies_df[~movies_df['color'].isna()]

### 3.2 Codificando colunas simples

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import string

def remove_special_chars(x, exceptions = ['|']):
    x = x.replace(' ', '').lower()
    for char in string.punctuation:
        if char not in exceptions:
            x = x.replace(char, '')
    return x
string.punctuation

#### Atores

In [None]:
columns_actors = ['actor_1_name', 'actor_2_name', 'actor_3_name']
categorical_df[columns_actors] = categorical_df[columns_actors].astype(str).applymap(remove_special_chars)
actors = categorical_df[columns_actors].values.flatten()

In [None]:
actor_vectorizer = CountVectorizer()
actor_vectorizer.fit(actors)
vocabulary = sorted(actor_vectorizer.vocabulary_)

In [None]:
join_actors = categorical_df[columns_actors[0]].astype(str) +'|' +  categorical_df[columns_actors[1]].astype(str) + '|' +categorical_df[columns_actors[2]].astype(str)
categorical_df.drop(columns = columns_actors, inplace=True)
join_actors.head()

In [None]:
#actors_vect =  actor_vectorizer.transform(categorical_df['actors']).toarray()
#categorical_df['actors'] = actors_vect
vect_actors = actor_vectorizer.transform(join_actors.values)
actors_df =pd.DataFrame(vect_actors.toarray())
actors_df.rename(lambda x: 'actor_'+vocabulary[int(x)], axis='columns', inplace=True)
categorical_df = categorical_df.join(actors_df)
categorical_df.head()

#### País, diretor, idioma, Título, Cor, Classificação Indicativa, Título e Link

In [None]:
#categorical_df.drop(columns=['movie_title', 'movie_imdb_link'], inplace=True) # não importam
variables = ['country', 'director_name', 'language', 'color', 'content_rating']
dummies = pd.get_dummies(categorical_df[variables])
categorical_df.drop(columns=variables, inplace=True)
categorical_df = categorical_df.join(dummies)
categorical_df.head()

### 3.3 Codificando colunas em formato de lista (gênero e keywords)

#### Gêneros

In [None]:
#Processando gênero
# Transformando string em lista
genres =  categorical_df.genres.apply(remove_special_chars)
genres.head()

In [None]:
vocabulary = set()
genres.apply(lambda x: vocabulary.update(x.split('|')))

genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(vocabulary)
vocabulary = sorted(genre_vectorizer.vocabulary_)
vocabulary

In [None]:
vect_genres = genre_vectorizer.transform(genres.values)
genres_binary_df = pd.DataFrame(vect_genres.toarray())
genres_binary_df.rename(lambda x: 'genre_'+vocabulary[int(x)], axis='columns', inplace=True)
genres_binary_df.head()

In [None]:
# Colocando de volta no Dataframe
categorical_df.drop('genres', axis='columns', inplace=True)
categorical_df = categorical_df.join(genres_binary_df)
categorical_df.head()

#### Keywords

In [None]:
#Lembrete do problema de '1000000 b.c.
keywords = categorical_df.plot_keywords.astype(str).apply(lambda x: x.split('|'))
all_keywords = set()
keywords.apply(lambda x: all_keywords.update(x))
sorted(all_keywords)

In [None]:
# Processando keywords
keywords = categorical_df.plot_keywords.astype(str).apply(remove_special_chars)
all_keywords = set()
keywords.apply(lambda x: all_keywords.update(x.split('|')))
len(all_keywords)

In [None]:
keyword_vectorizer = CountVectorizer()
keyword_vectorizer.fit(all_keywords)
vocabulary = sorted(keyword_vectorizer.vocabulary_)
len(vocabulary)

In [None]:
vect_keywords = genre_vectorizer.transform(keywords.values)
keywords_binary_df = pd.DataFrame(vect_keywords.toarray())
keywords_binary_df.rename(lambda x: 'keyword_'+vocabulary[int(x)], axis='columns', inplace=True)
keywords_binary_df.head()

In [None]:
# Colocando de volta no Dataframe
categorical_df.drop('plot_keywords', axis='columns', inplace=True)
categorical_df = categorical_df.join(keywords_binary_df)
categorical_df.head()

## 4. Processando variáveis numéricas

In [None]:
numerical_df.head()

### 4.1 Valores Ausentes

In [None]:
#### Visão geral
#pd.options.mode.chained_assignment = None ## remover dps

In [None]:
numerical_df.isnull().sum()

In [None]:
# Tabela de correlação par a par (Coeficiente de Pearson)
numerical_df.corr()

In [None]:
correlation_thresold = 0.8
# Identificando colunas altamente relacionadas
correlation_matrix = numerical_df.corr().abs()
# Seleciona o triângulo superior da matriz de correlação
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
# Seleciona colunas a serem removidas
to_drop = [column for column in upper.columns if any(upper[column] > correlation_thresold)]
to_drop

In [None]:
# Removendo colunas com alta correlação e revisando as colunas disponíveis
numerical_df.drop(numerical_df[to_drop], axis='columns', inplace=True)
numerical_df.isnull().sum()

#### Tratando os valores ausentes para cada coluna individualmente

#### Facenumber in poster

In [None]:
# Quase 50% das amostras não tem esses dados, deletar a coluna
numerical_df.drop('facenumber_in_poster', axis='columns', inplace=True)

#### Num critic reviews

In [None]:
numerical_df.num_critic_for_reviews.describe()

In [None]:
#Substituindo pela média
numerical_df.num_critic_for_reviews.fillna(numerical_df.num_critic_for_reviews.mean(), inplace=True)

#### Duração

In [None]:
# Subs. pela media, pois poucos valores
numerical_df['duration'].fillna(numerical_df.duration.mean(), inplace=True)

In [None]:
numerical_df.isnull().sum()

#### Director facebook likes

In [None]:
#nome dos diretores sem likes
movies_df.director_facebook_likes.describe()

In [None]:
# Opção - Modelo preditivo simples com regressão usando decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [None]:
regressor = DecisionTreeRegressor(random_state=0)
#data = pd.concat([categorical_df.dropna(), numerical_df])
data = numerical_df.dropna()
data_x = data.drop('director_facebook_likes', axis='columns')
data_y = data.director_facebook_likes
results = cross_val_score(regressor, data_x, data_y, cv=10, scoring="neg_mean_absolute_error")
np.mean(results)

In [None]:
regressor.fit(data_x, data_y)
x = numerical_df.drop('director_facebook_likes', axis='columns').fillna(numerical_df.mean())
predicted_likes = regressor.predict(x)

numerical_df.director_facebook_likes.fillna(pd.Series(predicted_likes), inplace=True)

In [None]:
numerical_df.isnull().sum()

In [None]:
numerical_df.director_facebook_likes.fillna(numerical_df.director_facebook_likes.mean(), inplace=True)
numerical_df.isnull().sum()

#### Actor Facebook Likes

In [None]:
#substituir pela média
columns = ['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']
for c in columns:
    numerical_df[c].fillna(numerical_df[c].mean(), inplace=True)
numerical_df.isnull().sum()

#### Gross

In [None]:
numerical_df.gross.describe()

In [None]:
regressor = DecisionTreeRegressor(random_state=0)
data = numerical_df.dropna()
data_x = data.drop('gross', axis='columns')
data_y = data.gross
results = cross_val_score(regressor, data_x, data_y, cv=10, scoring="neg_mean_absolute_error")
np.mean(results)

In [None]:
# Regressão utilizando arvore de decisao
regressor.fit(data_x, data_y)
x = numerical_df.drop('director_facebook_likes', axis='columns').fillna(numerical_df.mean())
predicted_gross = regressor.predict(x)

numerical_df.gross.fillna(pd.Series(predicted_likes), inplace=True)

In [None]:
numerical_df.isnull().sum()

In [None]:
numerical_df.gross.fillna(numerical_df.gross.mean(), inplace=True)

In [None]:
numerical_df.isnull().sum()

#### num_user_for_reviews

In [None]:
#média
numerical_df.num_user_for_reviews.fillna(numerical_df.num_user_for_reviews.mean(), inplace=True)
numerical_df.isnull().sum()

#### budget

In [None]:
numerical_df.budget.describe()

In [None]:
# regressão com decision tree
regressor = DecisionTreeRegressor(random_state=0)
data = numerical_df.dropna()
data_x = data.drop('budget', axis='columns')
data_y = data.budget
results = cross_val_score(regressor, data_x, data_y, cv=10, scoring="neg_mean_absolute_error")
np.mean(results)

In [None]:
# Regressão utilizando arvore de decisao
regressor.fit(data_x, data_y)
x = numerical_df.drop('budget', axis='columns').fillna(numerical_df.mean())
predicted_gross = regressor.predict(x)

numerical_df.budget.fillna(pd.Series(predicted_likes), inplace=True)

In [None]:
numerical_df.isnull().sum()

In [None]:
numerical_df.budget.fillna(numerical_df.budget.mean(), inplace=True)
numerical_df.isnull().sum()

### 4.2 Normalizando os dados

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
numerical_vals = numerical_df.values
numerical_vals = scaler.fit_transform(numerical_vals)
numerical_df = pd.DataFrame(numerical_vals, columns=numerical_df.columns, index=numerical_df.index)
numerical_df.head()

## 5.0 Agrupando resultados e exportando

In [None]:
result = numerical_df.join(categorical_df)
result.head()

In [None]:
len(result)

In [None]:
result.to_csv('../models/data.csv')