# Projeto Final CDD - Prever a nota de um filme no TMDB

In [153]:
%matplotlib inline
import json

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import norm, probplot
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Fixed seed, applied to NumPy's global random state so reruns are repeatable.
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

In [154]:
def regress(X,Y):
    """Utility: run an OLS regression of ``Y`` on ``X`` with a constant term added.

    Args:
        X: predictors, passed through ``sm.add_constant`` before fitting.
        Y: response, passed to ``sm.OLS`` as the dependent variable.

    Returns:
        The object returned by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(X)
    return sm.OLS(Y, design).fit()

In [155]:
# Load the raw movie table from a CSV expected in the notebook's working directory.
dados_filmes= pd.read_csv("tmdb_5000_movies.csv")

# Começaremos analisando os dados do Dataset

## As variáveis que analisaremos, para ver se há relação com a nota do filme, serão as seguintes:

### title, runtime, budget, revenue, popularity, vote_average, vote_count

## Construindo o Dataframe

In [156]:
# Work on an independent copy restricted to the columns used in the analysis.
dados_novos = dados_filmes[
    [
        "title",
        "runtime",
        "budget",
        "revenue",
        "vote_average",
        "vote_count",
        "popularity",
        "genres",
    ]
].copy()



#### Criação de uma coluna para cada gênero, com valor 1 quando o filme pertence àquela categoria e 0 quando não pertence; isso foi feito para substituir a coluna genres

In [157]:
# Collect every distinct genre name that appears anywhere in the "genres" column;
# each cell is JSON text holding a list of objects with a "name" key.
genres = {
    entry["name"]
    for raw in dados_novos['genres']
    for entry in json.loads(raw)
}

genres

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

#### Criação do novo dataframe incluindo as novas colunas de gênero

In [158]:
def extrai_generos(item):
    """Turn one raw ``genres`` cell into a dict of 0/1 genre indicators.

    Args:
        item: JSON text encoding a list of objects that each carry a ``"name"``
            key, e.g. ``'[{"name": "Action"}]'``. ``None`` or a float NaN is
            treated as "no genres".

    Returns:
        dict mapping ``'c_<genre>'`` to 1 if that genre is listed in ``item``
        and 0 otherwise. The dict always has one key per known genre (20 in
        total), in the same fixed order.

    Raises:
        KeyError: if ``item`` names a genre outside the known list.
    """
    generos = {'c_' + x: 0 for x in ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                              'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign',
                              'History', 'Horror', 'Music', 'Mystery', 'Romance',
                              'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']}
    # A missing cell has nothing to parse; NaN is the only float that differs from itself.
    if item is None or (isinstance(item, float) and item != item):
        return generos
    for g in json.loads(item):
        key = 'c_' + g['name']
        if key not in generos:
            # Keep failing loudly on an unexpected genre instead of silently adding a column.
            raise KeyError(key)
        # Assign rather than increment so a genre repeated in the JSON still
        # yields a 0/1 indicator instead of a count.
        generos[key] = 1
    return generos

# One indicator record per film, in the same row order as dados_novos.
dados_novos_teste = pd.DataFrame([extrai_generos(celula) for celula in dados_novos['genres']])

# Attach the indicator columns to the film rows, then index the result by film title.
dados_novos_junto = dados_novos.join(dados_novos_teste).set_index("title")

In [159]:
# Remove the raw JSON "genres" column. Rebinding (instead of inplace=True) together
# with errors="ignore" keeps this cell safe to re-run: a second in-place drop would
# raise KeyError because the column is already gone.
dados_novos_junto = dados_novos_junto.drop(columns="genres", errors="ignore")



## Variáveis para a análise

**Dicionário de dados:**


Variável | Descrição
:---:|---:
title | Título do filme
runtime | Tempo de filme
budget | Orçamento
revenue | Receita
vote_average | Média dos votos
vote_count | Número de votos das pessoas
popularity | Popularidade
genres | Os diferentes gêneros de filme

## Com a ajuda do método `describe`, podemos analisar as estatísticas descritivas do DataFrame


 

In [160]:
# Summary statistics for every numeric column.
# NOTE(review): the saved output shows a minimum of 0 for runtime, budget, revenue and
# vote_average (and a 25th percentile of 0 for revenue) — presumably placeholders for
# missing data; confirm before modelling on these columns.
dados_novos_junto.describe()

Unnamed: 0,runtime,budget,revenue,vote_average,vote_count,popularity,c_Action,c_Adventure,c_Animation,c_Comedy,...,c_History,c_Horror,c_Music,c_Mystery,c_Romance,c_Science Fiction,c_TV Movie,c_Thriller,c_War,c_Western
count,4801.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,...,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0,4803.0
mean,106.875859,29045040.0,82260640.0,6.092172,690.217989,21.492301,0.240267,0.164481,0.04872,0.358526,...,0.041016,0.108057,0.038518,0.072455,0.186134,0.111389,0.001666,0.265251,0.029981,0.017073
std,22.611935,40722390.0,162857100.0,1.194612,1234.585891,31.81665,0.42729,0.37075,0.215304,0.479618,...,0.198348,0.310485,0.192462,0.259266,0.389255,0.314646,0.040782,0.441513,0.170553,0.129556
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,94.0,790000.0,0.0,5.6,54.0,4.66807,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,103.0,15000000.0,19170000.0,6.2,235.0,12.921594,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,118.0,40000000.0,92917190.0,6.8,737.0,28.313505,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,338.0,380000000.0,2787965000.0,10.0,13752.0,875.581305,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [161]:
# Short alias: df is bound to the same object as dados_novos_junto (no copy is made).
df = dados_novos_junto

### Verificando a quantidade de valores nulos em cada coluna

In [162]:
# Count missing values per column. The DataFrame's own .sum() reduces column-wise by
# default; np.sum(DataFrame) forwards axis=None instead, which pandas >= 2.1 deprecates
# and will eventually treat as a whole-frame reduction returning a single scalar.
df.isnull().sum()

runtime              2
budget               0
revenue              0
vote_average         0
vote_count           0
popularity           0
c_Action             0
c_Adventure          0
c_Animation          0
c_Comedy             0
c_Crime              0
c_Documentary        0
c_Drama              0
c_Family             0
c_Fantasy            0
c_Foreign            0
c_History            0
c_Horror             0
c_Music              0
c_Mystery            0
c_Romance            0
c_Science Fiction    0
c_TV Movie           0
c_Thriller           0
c_War                0
c_Western            0
dtype: int64

### Removendo as linhas com valores NaN

In [163]:
# Drop every row that has a NaN in any column and rebind df to the result.
df = df.dropna()

## Separação das variáveis (resposta Y e preditoras X)

In [164]:
# Response variable: the vote_average column.
Y = df[ "vote_average"]

In [165]:
# Predictors: five numeric features followed by genre indicator columns.
# NOTE(review): c_Crime, c_Documentary, c_Drama, c_Family, c_Fantasy and c_Foreign
# are not selected here — confirm whether leaving them out is intentional.
X = df[
    [
        "runtime", "budget", "revenue", "vote_count", "popularity",
        "c_Action", "c_Adventure", "c_Animation", "c_Comedy",
        "c_History", "c_Horror", "c_Music", "c_Mystery", "c_Romance",
        "c_Science Fiction", "c_TV Movie", "c_Thriller", "c_War", "c_Western",
    ]
]

## Uso da função `regress`

In [166]:
# Fit the OLS model on all rows of X and Y (no hold-out) and display its summary.
# NOTE(review): the saved summary reports Cond. No. ≈ 4.59e9 — likely driven by the
# very different scales of budget/revenue versus the 0/1 genre columns; consider
# rescaling the monetary columns.
results = regress(X,Y)
results.summary()

0,1,2,3
Dep. Variable:,vote_average,R-squared:,0.249
Model:,OLS,Adj. R-squared:,0.246
Method:,Least Squares,F-statistic:,83.64
Date:,"Thu, 08 Nov 2018",Prob (F-statistic):,3.83e-280
Time:,14:34:34,Log-Likelihood:,-6964.1
No. Observations:,4801,AIC:,13970.0
Df Residuals:,4781,BIC:,14100.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.2513,0.088,48.479,0.000,4.079,4.423
runtime,0.0171,0.001,21.893,0.000,0.016,0.019
budget,-5.303e-09,5.76e-10,-9.202,0.000,-6.43e-09,-4.17e-09
revenue,-4.88e-10,1.77e-10,-2.757,0.006,-8.35e-10,-1.41e-10
vote_count,0.0003,2.42e-05,13.181,0.000,0.000,0.000
popularity,0.0032,0.001,4.226,0.000,0.002,0.005
c_Action,-0.2030,0.042,-4.854,0.000,-0.285,-0.121
c_Adventure,-0.0304,0.047,-0.641,0.522,-0.123,0.062
c_Animation,0.5867,0.076,7.688,0.000,0.437,0.736

0,1,2,3
Omnibus:,2475.762,Durbin-Watson:,1.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27660.925
Skew:,-2.21,Prob(JB):,0.0
Kurtosis:,13.897,Cond. No.,4590000000.0


Separando o dataframe em teste e treinamento

In [167]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
# (train_test_split is imported with the other dependencies at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=RANDOM_SEED)

Criando, treinando e testando o modelo de regressão linear, e calculando o R² no conjunto de teste

In [168]:
# Create the estimator, fit it on the training split, and predict the held-out split.
# (LinearRegression is imported with the other dependencies at the top of the notebook.)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Out-of-sample R² (coefficient of determination). LinearRegression.score computes the
# same quantity as the hand-written 1 - MSE / variance ratio it replaces, so the value
# can differ from a previously saved output only by floating-point rounding.
model.score(X_test, y_test)

0.27665709161969665