# Vanilla Linear Regression

In this notebook we are going to predict the aggregated popularity, revenue and budget for actors

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import iqr
from ast import literal_eval
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce

In [2]:
actors_dataset_df = pd.read_pickle("actors_dataset_df.pkl")
actors_dataset_df.head()

Unnamed: 0_level_0,budget,popularity,revenue,vote_average,vote_count,community
actor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17000000.0,14.215532,22638520.0,5.75,256.0,0
1,36652500.0,30.167837,99939180.0,6.175,772.625,1
2,37500000.0,19.503003,103468200.0,6.5,906.75,0
3,1100000.0,3.226925,6015562.0,6.9,53.0,0
4,33250000.0,21.136737,63901460.0,6.25,631.75,2


In [9]:
def remove_elements_from_list(l,elements):
    elements_set = set(elements)
    mod_l = set(l)
    mod_l = list(mod_l - elements_set)
    return mod_l

In [10]:
def get_train_feats_and_gt(df,gt_col,remove_cols = None):
    """Get the train features and groundtruth col
        
        Args:
            df(pandas.DataFrame):
            gt_col(str)         :
            remove_cols(list)   :
        Returns:
            A numpy.ndarray containing the features
            A numpy.ndarray containing the labels
    """
    df_cols = list(df.columns)
    if remove_cols:
        feat_cols = remove_elements_from_list(df_cols,remove_cols + [gt_col])
    else:
        feat_cols = remove_elements_from_list(df_cols,[gt_col])
    X = df[feat_cols].values
    y = df[gt_col].values
    return X, y

In [11]:
def nmae(y_gt,y_pred,den_type="iqr"):
    """Calculate the normalized mean-absolute error
        
        Can be normalized by 3 quantities calculated on the groundtruth:
        - iqr: 'Interquartile range'
        - range: 'Max-min range'
        - std: 'Standard deviation'
        
        Args:
            y_gt(numpy.ndarray)    :  the groundtruth values
            y_pred(numpy.ndarray)  :  the predicted values
            den_type(str)          :  the type of denominator           
        Returns:
            A float that is the value of the nmae
    """
    if den_type == "iqr":
        den = iqr(y_gt)
    elif den_type == "range":
        den = np.max(y_gt) - np.min(y_gt)
    elif den_type == "std":
        den = np.std(y_gt)
    else:
        raise ValueError("Normalized MAE can only handle iqr, range and std")
    return mean_absolute_error(y_gt,y_pred)/iqr(y_gt)

In [12]:
def one_hot_encode_feats(X,cols):
    """One hot encode feature
        Args:
            X(numpy.ndarray)                           :   the features
            cols(list)                                 :   list of column numbers of the features to be one-hot encoded
        Returns:
            A numpy.ndarray containing the encoded features
            A OneHotEncoder object       
    """
    enc = OneHotEncoder(handle_unknown="ignore",categorical_features=cols)
    encoded_feats = enc.fit_transform(X)
    return encoded_feats,enc

In [29]:
X_budget, y_budget = get_train_feats_and_gt(actors_dataset_df,"budget",["community"])
X_budget_com, y_budget_com = get_train_feats_and_gt(actors_dataset_df,"budget")
X_revenue, y_revenue = get_train_feats_and_gt(actors_dataset_df,"revenue",["community"])
X_revenue_com, y_revenue_com = get_train_feats_and_gt(actors_dataset_df,"revenue")
X_popularity, y_popularity = get_train_feats_and_gt(actors_dataset_df,"popularity",["community"])
X_popularity_com, y_popularity_com = get_train_feats_and_gt(actors_dataset_df,"popularity")

## Predict features

In [30]:
seed=10

### Predict budget

In [60]:
X_train_budget, X_test_budget, y_train_budget, y_test_budget = train_test_split(X_budget,y_budget,test_size=0.2,random_state=seed)
budget_lr = LinearRegression()
fit_budget_lr = budget_lr.fit(X_train_budget,y_train_budget)
y_pred_budget = fit_budget_lr.predict(X_test_budget)
budget_nmae = nmae(y_test_budget,y_pred_budget,"range")
print("Normalized MAE budget: "+str(budget_nmae))

Normalized MAE budget: 0.2527618842062534


### Predict budget using communities

In [45]:
X_train_budget_com, X_test_budget_com, y_train_budget_com, y_test_budget_com = train_test_split(X_budget_com,y_budget_com,test_size=0.2,random_state=seed)
budget_com_lr = LinearRegression()
fit_budget_com_lr = budget_com_lr.fit(X_train_budget_com,y_train_budget_com)
y_pred_budget_com = fit_budget_com_lr.predict(X_test_budget_com)
budget_com_nmae = nmae(y_test_budget_com,y_pred_budget_com,"range")
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.23126956803501916


In [50]:
print("Improvement percentage of budget prediction using communities: " +str((budget_nmae-budget_com_nmae)*100/budget_nmae)+" %")

Improvement percentage of budget prediction using communities: 8.50298938019331 %


### Predict revenue 

In [51]:
X_train_revenue, X_test_revenue, y_train_revenue, y_test_revenue = train_test_split(X_revenue,y_revenue,test_size=0.2,random_state=seed)
revenue_lr = LinearRegression()
fit_revenue_lr = revenue_lr.fit(X_train_revenue,y_train_revenue)
y_pred_revenue = fit_revenue_lr.predict(X_test_revenue)
revenue_nmae = nmae(y_test_revenue,y_pred_revenue,"range")
print("Normalized MAE revenue: "+str(revenue_nmae))

Normalized MAE revenue: 0.23467298745023915


### Predict revenue using communities

In [52]:
X_train_revenue_com, X_test_revenue_com, y_train_revenue_com, y_test_revenue_com = train_test_split(X_revenue_com,y_revenue_com,test_size=0.2,random_state=seed)
revenue_com_lr = LinearRegression()
fit_revenue_com_lr = revenue_com_lr.fit(X_train_revenue_com,y_train_revenue_com)
y_pred_revenue_com = fit_revenue_com_lr.predict(X_test_revenue_com)
revenue_com_nmae = nmae(y_test_revenue_com,y_pred_revenue_com,"range")
print("Normalized MAE revenue: "+str(revenue_com_nmae))

Normalized MAE revenue: 0.22787730426535835


In [61]:
print("Improvement percentage of revenue prediction using communities: " +str((revenue_nmae-revenue_com_nmae)*100/revenue_nmae)+" %")

Improvement percentage of revenue prediction using communities: 2.8958097217396106 %


### Predict popularity 

In [63]:
X_train_popularity, X_test_popularity, y_train_popularity, y_test_popularity = train_test_split(X_popularity,y_popularity,test_size=0.2,random_state=seed)
popularity_lr = LinearRegression()
fit_popularity_lr = popularity_lr.fit(X_train_popularity,y_train_popularity)
y_pred_popularity = fit_popularity_lr.predict(X_test_popularity)
popularity_nmae = nmae(y_test_popularity,y_pred_popularity,"range")
print("Normalized MAE popularity: "+str(popularity_nmae))

Normalized MAE popularity: 0.23811801059246626


### Predict popularity using communities

In [64]:
X_train_popularity_com, X_test_popularity_com, y_train_popularity_com, y_test_popularity_com = train_test_split(X_popularity_com,y_popularity_com,test_size=0.2,random_state=seed)
popularity_com_lr = LinearRegression()
fit_popularity_com_lr = popularity_com_lr.fit(X_train_popularity_com,y_train_popularity_com)
y_pred_popularity_com = fit_popularity_com_lr.predict(X_test_popularity_com)
popularity_com_nmae = nmae(y_test_popularity_com,y_pred_popularity_com,"range")
print("Normalized MAE popularity: "+str(popularity_com_nmae))

Normalized MAE popularity: 0.23683402554734675


In [65]:
print("Improvement percentage of popularity prediction using communities: " +str((popularity_nmae-popularity_com_nmae)*100/popularity_nmae)+" %")

Improvement percentage of popularity prediction using communities: 0.5392221453239918 %
