 ### -REGRESSÃO LINEAR MÚLTIPLA

É mútipla pois nela são tratadas duas ou mais variáveis explicativas

In [53]:
%matplotlib inline
%matplotlib notebook
%reset -f
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

### Ler o arquivo

In [54]:
df = pd.read_csv("tmdb_5000_movies.csv")
df.drop(df[df['budget']==0].index, inplace=True)

### limpeza do dataframe 

In [55]:
df_novo = df.filter(['budget', 'genres', 'popularity', 'production_companies', 'production_countries',
                     'release_date', 'revenue', 'runtime', 'vote_average', 'vote_count'], axis=1)

### Função utilitária para fazer a regressão com constante adicionada

In [56]:
def regress(X,Y):
    X_cp = sm.add_constant(X)
    model = sm.OLS(Y,X_cp)
    results = model.fit()
    return results

Criação de colunas com o nome dos generos, sendo 1 para quando o filme é daquela categoria e 0 para quando não é , isso foi feito para tirar a coluna genre

In [57]:
import json

genres = set()
for x in df_novo['genres']:
    for g in json.loads(x):
        genres.add(g["name"])


criacao do novo dataframe incluindo as novas colunas de genero

In [58]:
def extrai_generos(item):
    generos = {'c_' + x: 0 for x in ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 
                              'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 
                              'History', 'Horror', 'Music', 'Mystery', 'Romance', 
                              'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']}
    for g in json.loads(item):
        generos['c_' + g['name']] += 1
    return generos

df_teste = pd.DataFrame(list(df_novo['genres'].apply(extrai_generos).values))
df_junto = df_novo.join(df_teste)

In [59]:
df_junto['revenue'].value_counts()

0            537
8000000        6
12000000       5
100000000      5
6000000        5
7000000        5
5000000        4
10000000       4
11000000       4
14000000       4
17000000       3
4000000        3
13000000       3
7800000        3
25000000       3
30000000       3
60000000       3
14400000       3
32000000       3
36000000       2
2200000        2
28200000       2
110000000      2
102000000      2
10400000       2
70000000       2
8500000        2
11             2
94000000       2
15000000       2
            ... 
104303851      1
2401510        1
691210692      1
83719388       1
193355800      1
123729176      1
150680864      1
32222567       1
159616327      1
28575078       1
60700000       1
113020255      1
7103838        1
98159963       1
195268056      1
190213455      1
179379533      1
52034889       1
99067206       1
149044513      1
150406466      1
128769345      1
6600000        1
5363000        1
24261569       1
84460846       1
156505388      1
146292009     

adição das novas colunas com data no dataframe

In [60]:
df_junto['ano'] , df_junto['mes'] , df_junto['dia'] = df_junto['release_date'].str.split('-',2).str
df_junto['ano'] = pd.to_numeric(df_junto['ano'])
df_junto['mes'] = pd.to_numeric(df_junto['mes'])
df_junto['dia']  = pd.to_numeric(df_junto['dia'] )

Obtendo as variáveis resposta e preditora:


-Variável resposta , ou seja , as notas do site IMDB


-Variável preditora , ou seja , todas as colunas que serão usadas para prever as notas dos filmes

In [61]:
df_sem_nan = df_junto.dropna()

In [62]:
Y = df_sem_nan['vote_average']

In [63]:
X = df_sem_nan[['budget', 'popularity', 'runtime', 'vote_count','ano','mes']]

In [64]:
results = regress(X,Y)
results.summary()

0,1,2,3
Dep. Variable:,vote_average,R-squared:,0.37
Model:,OLS,Adj. R-squared:,0.369
Method:,Least Squares,F-statistic:,315.1
Date:,"Thu, 08 Nov 2018",Prob (F-statistic):,2.55e-318
Time:,15:49:36,Log-Likelihood:,-3370.2
No. Observations:,3220,AIC:,6754.0
Df Residuals:,3213,BIC:,6797.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,20.5914,2.580,7.981,0.000,15.533,25.650
budget,-6.295e-09,3.41e-10,-18.478,0.000,-6.96e-09,-5.63e-09
popularity,0.0012,0.001,2.304,0.021,0.000,0.002
runtime,0.0141,0.001,21.765,0.000,0.013,0.015
vote_count,0.0003,1.41e-05,20.376,0.000,0.000,0.000
ano,-0.0080,0.001,-6.284,0.000,-0.011,-0.006
mes,0.0216,0.004,5.963,0.000,0.014,0.029

0,1,2,3
Omnibus:,276.872,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,464.31
Skew:,-0.625,Prob(JB):,1.4999999999999998e-101
Kurtosis:,4.377,Cond. No.,13000000000.0


Separando o dataframe em teste e treinamento

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=RANDOM_SEED)

Treinando, criando , testando e calculando o modelo de regressão linear

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)

0.3879962273565626

Aplicando o Random Forest Regressor

In [68]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor()       #cria
model.fit(X_train, y_train)        #treina 
y_pred = model.predict(X_test)      #testa 

1 - mean_squared_error(y_test, y_pred) / np.var(y_test)



0.45877898530966144