#                     Projeto Final CDD- Prever a nota de um filme no IDB

In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import norm, probplot
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
import json

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

In [23]:
### Função utilitária para fazer a regressão com constante adicionada
def regress(X,Y):
    X_cp = sm.add_constant(X)
    model = sm.OLS(Y,X_cp)
    results = model.fit()
    return results

In [24]:
dados_filmes= pd.read_csv("tmdb_5000_movies.csv")

In [25]:
dados_filmes.drop(dados_filmes[dados_filmes["budget"]==0].index,inplace=True)


# Começaremos analisando os dados do Dataset

## As variáveis numéricas que analisaremos, para ver se há uma relação com a nota do filme, será as numéricas. Estas serão:

### title, runtime budget, revenue, popularity, vote_average, vote_count

## Construindo o Dataframe

In [26]:
dados_novos = dados_filmes[["title","runtime","budget","revenue", "vote_average", "vote_count", "popularity","genres",'release_date']].copy()

#### Criação de colunas com o nome dos generos, sendo 1 para quando o filme é daquela categoria e 0 para quando não é , isso foi feito para tirar a coluna genre

In [27]:
genres = set()
for x in dados_novos['genres']:
    for g in json.loads(x):
        genres.add(g["name"])

genres

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

#### Criacão do novo dataframe incluindo as novas colunas de gênero

In [28]:
def extrai_generos(item):
    generos = {'c_' + x: 0 for x in ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 
                              'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 
                              'History', 'Horror', 'Music', 'Mystery', 'Romance', 
                              'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']}
    for g in json.loads(item):
        generos['c_' + g['name']] += 1
    return generos

dados_novos_teste = pd.DataFrame(list(dados_novos['genres'].apply(extrai_generos).values))

dados_novos_junto = dados_novos.join(dados_novos_teste)

dados_novos_junto.set_index("title",inplace=True)

In [29]:
dados_novos_junto.drop(columns= "genres", inplace=True)


dados_novos_junto

Unnamed: 0_level_0,runtime,budget,revenue,vote_average,vote_count,popularity,release_date,c_Action,c_Adventure,c_Animation,...,c_History,c_Horror,c_Music,c_Mystery,c_Romance,c_Science Fiction,c_TV Movie,c_Thriller,c_War,c_Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,162.0,237000000,2787965087,7.2,11800,150.437577,2009-12-10,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,169.0,300000000,961000000,6.9,4500,139.082615,2007-05-19,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,148.0,245000000,880674609,6.3,4466,107.376788,2015-10-26,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,165.0,250000000,1084939099,7.6,9106,112.312950,2012-07-16,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
John Carter,132.0,260000000,284139100,6.1,2124,43.926995,2012-03-07,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Spider-Man 3,139.0,258000000,890871626,5.9,3576,115.699814,2007-05-01,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tangled,100.0,260000000,591794936,7.4,3330,48.681969,2010-11-24,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Avengers: Age of Ultron,141.0,280000000,1405403694,7.3,6767,134.279229,2015-04-22,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Harry Potter and the Half-Blood Prince,153.0,250000000,933959197,7.4,5293,98.885637,2009-07-07,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Batman v Superman: Dawn of Justice,151.0,250000000,873260194,5.7,7004,155.790452,2016-03-23,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Criação de novas colunas que contém o mês, ano, e dia , cada uma em sua própia coluna no dataframe 

In [30]:
dados_novos_junto['ano'] , dados_novos_junto['mes'] , dados_novos_junto['dia'] = dados_novos_junto['release_date'].str.split('-',2).str
dados_novos_junto['ano'] = pd.to_numeric(dados_novos_junto['ano'])
dados_novos_junto['mes'] = pd.to_numeric(dados_novos_junto['mes'])
dados_novos_junto['dia']  = pd.to_numeric(dados_novos_junto['dia'] )

## Variáveis para a análise

**Dicionário de dados:**


Variável | Descrição
:---:|---:
title | Título do filme
runtime | Tempo de filme
budget | Orçamento
revenue | Receita
vote_average | Média dos votos
vote_count | Número de votos das pessoas
popularity | Popularidade
genres | Os diferentes gêneros de filme
c_Action | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Adventure | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Animation | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Comedy | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Crime | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Documentary | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Drama | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Family | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Fantasy | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Foreign | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_History | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Horror | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Music | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Mystery | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Romance | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Science Fiction | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_TV Movie | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Thriller | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_War | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
c_Western | 0 quando o filme não pertence à esse gênero, e 1 quando pertence
ano | ano em que o filme foi lançado
mês | mês em que o filme foi lançado
dia | dia em que o filme foi lançado

## Com a ajuda do comando Describe, poderemos analisar fatores importantes do Dataframe


 

In [31]:
dados_novos_junto.describe()

Unnamed: 0,runtime,budget,revenue,vote_average,vote_count,popularity,c_Action,c_Adventure,c_Animation,c_Comedy,...,c_Mystery,c_Romance,c_Science Fiction,c_TV Movie,c_Thriller,c_War,c_Western,ano,mes,dia
count,3764.0,3766.0,3766.0,3766.0,3766.0,3766.0,3221.0,3221.0,3221.0,3221.0,...,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3766.0,3766.0,3766.0
mean,109.324389,37042840.0,103954700.0,6.226474,856.496814,25.984641,0.282521,0.20801,0.06054,0.350512,...,0.074201,0.17417,0.1276,0.000931,0.289662,0.035082,0.017075,2002.090813,6.935741,15.433617
std,20.753408,42646510.0,177639900.0,0.966228,1344.78569,34.438253,0.450295,0.405947,0.238522,0.477204,...,0.262138,0.379314,0.333696,0.030509,0.453676,0.184016,0.129573,12.876561,3.39316,8.520344
min,0.0,1.0,0.0,0.0,0.0,0.001586,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1916.0,1.0,1.0
25%,95.0,8000000.0,6009736.0,5.7,114.0,7.97499,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1999.0,4.0,9.0
50%,105.5,23000000.0,38831370.0,6.3,365.5,17.604955,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2005.0,7.0,15.0
75%,120.0,50000000.0,122087300.0,6.9,969.25,33.925801,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2011.0,10.0,22.0
max,338.0,380000000.0,2787965000.0,10.0,13752.0,875.581305,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2016.0,12.0,31.0


### Tirando os dados Nan

In [32]:
df = dados_novos_junto

df = df.dropna()

### Verificando valores nulos das variáveis em certa coluna

In [33]:
np.sum(df.isnull())

runtime              0
budget               0
revenue              0
vote_average         0
vote_count           0
popularity           0
release_date         0
c_Action             0
c_Adventure          0
c_Animation          0
c_Comedy             0
c_Crime              0
c_Documentary        0
c_Drama              0
c_Family             0
c_Fantasy            0
c_Foreign            0
c_History            0
c_Horror             0
c_Music              0
c_Mystery            0
c_Romance            0
c_Science Fiction    0
c_TV Movie           0
c_Thriller           0
c_War                0
c_Western            0
ano                  0
mes                  0
dia                  0
dtype: int64

## Separação das  variáveis

In [34]:
Y = df[ "vote_average"]

In [35]:
X = df[["runtime","budget","revenue","vote_count", "popularity","c_Action", "c_Adventure","c_Animation","c_Comedy","c_History",
        "c_Horror","c_Music","c_Mystery","c_Romance","c_Science Fiction","c_TV Movie","c_Thriller","c_War","c_Western","mes","ano"]]

## Uso da função Regress

In [36]:
results = regress(X,Y)
results.summary()

0,1,2,3
Dep. Variable:,vote_average,R-squared:,0.378
Model:,OLS,Adj. R-squared:,0.374
Method:,Least Squares,F-statistic:,92.47
Date:,"Mon, 12 Nov 2018",Prob (F-statistic):,7.39e-310
Time:,16:55:29,Log-Likelihood:,-3351.2
No. Observations:,3220,AIC:,6746.0
Df Residuals:,3198,BIC:,6880.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,20.6025,2.595,7.941,0.000,15.515,25.690
runtime,0.0144,0.001,22.121,0.000,0.013,0.016
budget,-6.176e-09,4.31e-10,-14.343,0.000,-7.02e-09,-5.33e-09
revenue,-2.854e-11,1.19e-10,-0.239,0.811,-2.62e-10,2.05e-10
vote_count,0.0003,1.62e-05,17.857,0.000,0.000,0.000
popularity,0.0012,0.001,2.391,0.017,0.000,0.002
c_Action,-0.0122,0.033,-0.373,0.709,-0.076,0.052
c_Adventure,-0.0941,0.035,-2.683,0.007,-0.163,-0.025
c_Animation,0.2337,0.055,4.257,0.000,0.126,0.341

0,1,2,3
Omnibus:,276.787,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,464.042
Skew:,-0.625,Prob(JB):,1.72e-101
Kurtosis:,4.376,Cond. No.,48700000000.0


Adotamos um ALFA de 5%, então tiramos os valores que tem um P>T maior que o ALFA 

In [37]:
X = df[["runtime","budget","revenue","vote_count", "popularity","mes","ano"]]

Separando o dataframe em teste e treinamento

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=RANDOM_SEED)

Treinando, criando , testando e calculando o modelo de regressão linear

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)

0.38747462761081053

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)

0.45583843342651187