# Vanilla Linear Regression

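In this notebook we use plain linear regression to predict the aggregated popularity, revenue, and budget of actors, and we compare the prediction error obtained with and without community information.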

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import iqr
from ast import literal_eval
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce

In [None]:
# Load the pre-built actors dataset from a pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
actors_dataset_df = pd.read_pickle("actors_dataset_df.pkl")

# Preview the first five rows as the cell output.
actors_dataset_df.head(5)

In [None]:
# Build one (features, ground truth) pair per target, in two variants each.
# The plain variant passes ["community"] as a third argument; the "_com"
# variant omits it. Presumably that argument lists columns to leave out of
# the features -- TODO confirm against project_utils.get_train_feats_and_gt.

# Budget
X_budget, y_budget = get_train_feats_and_gt(
    actors_dataset_df, "budget", ["community"]
)
X_budget_com, y_budget_com = get_train_feats_and_gt(
    actors_dataset_df, "budget"
)

# Revenue
X_revenue, y_revenue = get_train_feats_and_gt(
    actors_dataset_df, "revenue", ["community"]
)
X_revenue_com, y_revenue_com = get_train_feats_and_gt(
    actors_dataset_df, "revenue"
)

# Popularity
X_popularity, y_popularity = get_train_feats_and_gt(
    actors_dataset_df, "popularity", ["community"]
)
X_popularity_com, y_popularity_com = get_train_feats_and_gt(
    actors_dataset_df, "popularity"
)

## Predict features

In [None]:
# Random state shared by every train/test split below, so that all models
# are evaluated on reproducible partitions.
seed = 10

### Predict budget

In [None]:
# Hold out 20% of the budget samples for evaluation (reproducible via `seed`).
X_train_budget, X_test_budget, y_train_budget, y_test_budget = train_test_split(
    X_budget,
    y_budget,
    test_size=0.2,
    random_state=seed,
)

# Fit an ordinary least-squares model on the training partition.
budget_lr = LinearRegression()
fit_budget_lr = budget_lr.fit(X_train_budget, y_train_budget)

# Score the held-out predictions with the project's normalized MAE helper.
# The "range" option presumably normalizes by the target range -- TODO confirm.
y_pred_budget = fit_budget_lr.predict(X_test_budget)
budget_nmae = nmae(y_test_budget, y_pred_budget, "range")
print("Normalized MAE budget: ", budget_nmae, sep="")

### Predict budget using communities

In [None]:
# Same 80/20 split settings as elsewhere, applied to the "_com" budget features.
X_train_budget_com, X_test_budget_com, y_train_budget_com, y_test_budget_com = train_test_split(
    X_budget_com,
    y_budget_com,
    test_size=0.2,
    random_state=seed,
)

# Fit the linear model on the training partition.
budget_com_lr = LinearRegression()
fit_budget_com_lr = budget_com_lr.fit(X_train_budget_com, y_train_budget_com)

# Evaluate on the held-out partition with the normalized MAE helper.
y_pred_budget_com = fit_budget_com_lr.predict(X_test_budget_com)
budget_com_nmae = nmae(y_test_budget_com, y_pred_budget_com, "range")
# NOTE(review): this label is the same text used for the budget model without
# communities; only the printed value distinguishes the two outputs.
print("Normalized MAE budget: ", budget_com_nmae, sep="")

In [None]:
# Relative change in normalized MAE, as a percentage of the baseline error.
# A positive value means the "_com" model has the lower error.
print(
    "Improvement percentage of budget prediction using communities: ",
    (budget_nmae - budget_com_nmae) * 100 / budget_nmae,
    " %",
    sep="",
)

### Predict revenue 

In [None]:
# Hold out 20% of the revenue samples for evaluation (reproducible via `seed`).
X_train_revenue, X_test_revenue, y_train_revenue, y_test_revenue = train_test_split(
    X_revenue,
    y_revenue,
    test_size=0.2,
    random_state=seed,
)

# Fit an ordinary least-squares model on the training partition.
revenue_lr = LinearRegression()
fit_revenue_lr = revenue_lr.fit(X_train_revenue, y_train_revenue)

# Score the held-out predictions with the project's normalized MAE helper.
# The "range" option presumably normalizes by the target range -- TODO confirm.
y_pred_revenue = fit_revenue_lr.predict(X_test_revenue)
revenue_nmae = nmae(y_test_revenue, y_pred_revenue, "range")
print("Normalized MAE revenue: ", revenue_nmae, sep="")

### Predict revenue using communities

In [None]:
# Same 80/20 split settings as elsewhere, applied to the "_com" revenue features.
X_train_revenue_com, X_test_revenue_com, y_train_revenue_com, y_test_revenue_com = train_test_split(
    X_revenue_com,
    y_revenue_com,
    test_size=0.2,
    random_state=seed,
)

# Fit the linear model on the training partition.
revenue_com_lr = LinearRegression()
fit_revenue_com_lr = revenue_com_lr.fit(X_train_revenue_com, y_train_revenue_com)

# Evaluate on the held-out partition with the normalized MAE helper.
y_pred_revenue_com = fit_revenue_com_lr.predict(X_test_revenue_com)
revenue_com_nmae = nmae(y_test_revenue_com, y_pred_revenue_com, "range")
# NOTE(review): this label is the same text used for the revenue model without
# communities; only the printed value distinguishes the two outputs.
print("Normalized MAE revenue: ", revenue_com_nmae, sep="")

In [None]:
# Relative change in normalized MAE, as a percentage of the baseline error.
# A positive value means the "_com" model has the lower error.
print(
    "Improvement percentage of revenue prediction using communities: ",
    (revenue_nmae - revenue_com_nmae) * 100 / revenue_nmae,
    " %",
    sep="",
)

### Predict popularity 

In [None]:
# Hold out 20% of the popularity samples for evaluation (reproducible via `seed`).
X_train_popularity, X_test_popularity, y_train_popularity, y_test_popularity = train_test_split(
    X_popularity,
    y_popularity,
    test_size=0.2,
    random_state=seed,
)

# Fit an ordinary least-squares model on the training partition.
popularity_lr = LinearRegression()
fit_popularity_lr = popularity_lr.fit(X_train_popularity, y_train_popularity)

# Score the held-out predictions with the project's normalized MAE helper.
# The "range" option presumably normalizes by the target range -- TODO confirm.
y_pred_popularity = fit_popularity_lr.predict(X_test_popularity)
popularity_nmae = nmae(y_test_popularity, y_pred_popularity, "range")
print("Normalized MAE popularity: ", popularity_nmae, sep="")

### Predict popularity using communities

In [None]:
# Same 80/20 split settings as elsewhere, applied to the "_com" popularity features.
X_train_popularity_com, X_test_popularity_com, y_train_popularity_com, y_test_popularity_com = train_test_split(
    X_popularity_com,
    y_popularity_com,
    test_size=0.2,
    random_state=seed,
)

# Fit the linear model on the training partition.
popularity_com_lr = LinearRegression()
fit_popularity_com_lr = popularity_com_lr.fit(X_train_popularity_com, y_train_popularity_com)

# Evaluate on the held-out partition with the normalized MAE helper.
y_pred_popularity_com = fit_popularity_com_lr.predict(X_test_popularity_com)
popularity_com_nmae = nmae(y_test_popularity_com, y_pred_popularity_com, "range")
# NOTE(review): this label is the same text used for the popularity model without
# communities; only the printed value distinguishes the two outputs.
print("Normalized MAE popularity: ", popularity_com_nmae, sep="")

In [None]:
# Relative change in normalized MAE, as a percentage of the baseline error.
# A positive value means the "_com" model has the lower error.
print(
    "Improvement percentage of popularity prediction using communities: ",
    (popularity_nmae - popularity_com_nmae) * 100 / popularity_nmae,
    " %",
    sep="",
)