#                     Projeto Final CDD- Prever a nota de um filme no IDB

In [237]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import norm, probplot
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
import json

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

In [238]:
### Função utilitária para fazer a regressão com constante adicionada
def regress(X,Y):
    X_cp = sm.add_constant(X)
    model = sm.OLS(Y,X_cp)
    results = model.fit()
    return results

In [239]:
dados_filmes= pd.read_csv("tmdb_5000_movies.csv")

In [240]:
dados_filmes.drop(dados_filmes[dados_filmes["budget"]==0].index,inplace=True)

# Começaremos analisando os dados do Dataset

## As variáveis numéricas que analisaremos, para ver se há uma relação com a nota do filme, será as numéricas. Estas serão:

### title, runtime budget, revenue, popularity, vote_average, vote_count

## Construindo o Dataframe

In [241]:
dados_novos = dados_filmes[["title","runtime","budget","revenue", "vote_average", "vote_count", "popularity","genres"]].copy()

#### Criação de colunas com o nome dos generos, sendo 1 para quando o filme é daquela categoria e 0 para quando não é , isso foi feito para tirar a coluna genre

In [242]:
genres = set()
for x in dados_novos['genres']:
    for g in json.loads(x):
        genres.add(g["name"])

genres

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

#### Criacão do novo dataframe incluindo as novas colunas de gênero

In [243]:
def extrai_generos(item):
    generos = {'c_' + x: 0 for x in ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 
                              'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 
                              'History', 'Horror', 'Music', 'Mystery', 'Romance', 
                              'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']}
    for g in json.loads(item):
        generos['c_' + g['name']] += 1
    return generos

dados_novos_teste = pd.DataFrame(list(dados_novos['genres'].apply(extrai_generos).values))

dados_novos_junto = dados_novos.join(dados_novos_teste)

dados_novos_junto.set_index("title",inplace=True)

In [244]:
dados_novos_junto.drop(columns= "genres", inplace=True)



## Variáveis para a análise

**Dicionário de dados:**


Variável | Descrição
:---:|---:
title | Título do filme
runtime | Tempo de filme
budget | Orçamento
revenue | Receita
vote_average | Média dos votos
vote_count | Número de votos das pessoas
popularity | Popularidade
genres | Os diferentes gêneros de filme

## Com a ajuda do comando Describe, poderemos analisar fatores importantes do Dataframe


 

In [245]:
dados_novos_junto.describe()

Unnamed: 0,runtime,budget,revenue,vote_average,vote_count,popularity,c_Action,c_Adventure,c_Animation,c_Comedy,...,c_History,c_Horror,c_Music,c_Mystery,c_Romance,c_Science Fiction,c_TV Movie,c_Thriller,c_War,c_Western
count,3764.0,3766.0,3766.0,3766.0,3766.0,3766.0,3221.0,3221.0,3221.0,3221.0,...,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0,3221.0
mean,109.324389,37042840.0,103954700.0,6.226474,856.496814,25.984641,0.282521,0.20801,0.06054,0.350512,...,0.044707,0.105247,0.034772,0.074201,0.17417,0.1276,0.000931,0.289662,0.035082,0.017075
std,20.753408,42646510.0,177639900.0,0.966228,1344.78569,34.438253,0.450295,0.405947,0.238522,0.477204,...,0.206691,0.306919,0.18323,0.262138,0.379314,0.333696,0.030509,0.453676,0.184016,0.129573
min,0.0,1.0,0.0,0.0,0.0,0.001586,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,95.0,8000000.0,6009736.0,5.7,114.0,7.97499,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,105.5,23000000.0,38831370.0,6.3,365.5,17.604955,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,120.0,50000000.0,122087300.0,6.9,969.25,33.925801,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,338.0,380000000.0,2787965000.0,10.0,13752.0,875.581305,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Verificando valores nulos das variáveis em certa coluna

In [246]:
np.sum(df.isnull())

runtime              0
budget               0
revenue              0
vote_average         0
vote_count           0
popularity           0
c_Action             0
c_Adventure          0
c_Animation          0
c_Comedy             0
c_Crime              0
c_Documentary        0
c_Drama              0
c_Family             0
c_Fantasy            0
c_Foreign            0
c_History            0
c_Horror             0
c_Music              0
c_Mystery            0
c_Romance            0
c_Science Fiction    0
c_TV Movie           0
c_Thriller           0
c_War                0
c_Western            0
dtype: int64

### Tirando os dados Nan

In [247]:
df = dados_novos_junto

df = df.dropna()

## Separação das  variáveis

In [248]:
Y = df[ "vote_average"]

In [249]:
X = df[["runtime","budget","revenue","vote_count", "popularity","c_Action", "c_Adventure","c_Animation","c_Comedy","c_History",
        "c_Horror","c_Music","c_Mystery","c_Romance","c_Science Fiction","c_TV Movie","c_Thriller","c_War","c_Western"]]

## Uso da função Regress

In [250]:
results = regress(X,Y)
results.summary()

0,1,2,3
Dep. Variable:,vote_average,R-squared:,0.363
Model:,OLS,Adj. R-squared:,0.359
Method:,Least Squares,F-statistic:,95.94
Date:,"Thu, 08 Nov 2018",Prob (F-statistic):,1.42e-295
Time:,15:04:26,Log-Likelihood:,-3389.3
No. Observations:,3220,AIC:,6819.0
Df Residuals:,3200,BIC:,6940.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.4779,0.073,61.722,0.000,4.336,4.620
runtime,0.0160,0.001,25.557,0.000,0.015,0.017
budget,-6.789e-09,4.26e-10,-15.941,0.000,-7.62e-09,-5.95e-09
revenue,8.025e-11,1.2e-10,0.670,0.503,-1.55e-10,3.15e-10
vote_count,0.0003,1.63e-05,16.980,0.000,0.000,0.000
popularity,0.0011,0.001,2.082,0.037,6.29e-05,0.002
c_Action,-0.0096,0.033,-0.291,0.771,-0.075,0.055
c_Adventure,-0.0911,0.035,-2.568,0.010,-0.161,-0.022
c_Animation,0.2458,0.056,4.427,0.000,0.137,0.355

0,1,2,3
Omnibus:,263.254,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,437.514
Skew:,-0.604,Prob(JB):,9.89e-96
Kurtosis:,4.342,Cond. No.,7470000000.0


Adotamos um ALFA de 5%, então tiramos os valores que tem um P>T maior que o ALFA 

In [251]:
X = df[["runtime","budget","revenue","vote_count", "popularity"]]

Separando o dataframe em teste e treinamento

In [252]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=RANDOM_SEED)

Treinando, criando , testando e calculando o modelo de regressão linear

In [253]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)

0.38094718882784806

In [254]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)

0.4281948562992448