# Vanilla Linear Regression

In this notebook we are going to predict the aggregated popularity, revenue and budget for actors

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import iqr
from ast import literal_eval
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce

In [51]:
actors_dataset_df = pd.read_pickle("actors_dataset_df.pkl")
actors_dataset_df.head()

Unnamed: 0_level_0,budget,popularity,revenue,vote_average,vote_count,community
actor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17000000.0,14.215532,22638520.0,5.75,256.0,0
1,36652500.0,30.167837,99939180.0,6.175,772.625,1
2,37500000.0,19.503003,103468200.0,6.5,906.75,0
3,1100000.0,3.226925,6015562.0,6.9,53.0,0
4,33250000.0,21.136737,63901460.0,6.25,631.75,2


In [36]:
def remove_elements_from_list(l,elements):
    elements_set = set(elements)
    mod_l = set(l)
    mod_l = list(mod_l - elements_set)
    return mod_l

In [37]:
def get_train_feats_and_gt(df,gt_col,remove_cols = None):
    """Get the train features and groundtruth col
        
        Args:
            df(pandas.DataFrame):
            gt_col(str)         :
            remove_cols(list)   :
        Returns:
            A numpy.ndarray containing the features
            A numpy.ndarray containing the labels
    """
    df_cols = list(df.columns)
    if remove_cols:
        feat_cols = remove_elements(df_cols,remove_cols + [gt_col])
    else:
        feat_cols = remove_elements(df_cols,[gt_col])
    X = df[feat_cols].values
    y = df[gt_col].values
    return X, y

In [43]:
def nmae(y_gt,y_pred,den_type="iqr"):
    """Calculate the normalized mean-absolute error
        
        Can be normalized by 3 quantities calculated on the groundtruth:
        - iqr: 'Interquartile range'
        - range: 'Max-min range'
        - std: 'Standard deviation'
        
        Args:
            y_gt(numpy.ndarray)    :  the groundtruth values
            y_pred(numpy.ndarray)  :  the predicted values
            den_type(str)          :  the type of denominator           
        Returns:
            A float that is the value of the nmae
    """
    if den_type == "iqr":
        den = iqr(y_gt)
    elif den_type == "range":
        den = np.max(y_gt) - np.min(y_gt)
    elif den_type == "std":
        den = np.std(y_gt)
    else:
        raise ValueError("Normalized MAE can only handle iqr, range and std")
    return mean_absolute_error(y_gt,y_pred)/iqr(y_gt)

In [79]:
def one_hot_encode_feats(X,cols):
    """One hot encode feature
        Args:
            X(numpy.ndarray)                           :   the features
            cols(list)                                 :   list of column numbers of the features to be one-hot encoded
        Returns:
            A numpy.ndarray containing the encoded features
            A OneHotEncoder object       
    """
    enc = OneHotEncoder(handle_unknown="ignore",categorical_features=cols)
    encoded_feats = enc.fit_transform(X)
    return encoded_feats,enc

In [85]:
X_budget, y_budget = get_train_feats_and_gt(actors_dataset_df,"budget",["community"])
X_budget_com, y_budget_com = get_train_feats_and_gt(actors_dataset_df,"budget")
X_budget_ohe, _ = one_hot_encode_feat(X_budget_com,[0])


# X_revenue, y_revenue = get_train_feats_and_gt(actors_dataset_df,"revenue",["community"])
# X_revenue_com, y_revenue_com = get_train_feats_and_gt(actors_dataset_df,"revenue")
# X_popularity, y_popularity = get_train_feats_and_gt(actors_dataset_df,"popularity",["popularity"])
# X_popularity_com, y_popularity_com = get_train_feats_and_gt(actors_dataset_df,"popularity")



In [57]:
X_popularity_com

array([[0.00000000e+00, 5.75000000e+00, 2.26385162e+07, 1.70000000e+07,
        2.56000000e+02],
       [1.00000000e+00, 6.17500000e+00, 9.99391766e+07, 3.66525000e+07,
        7.72625000e+02],
       [0.00000000e+00, 6.50000000e+00, 1.03468236e+08, 3.75000000e+07,
        9.06750000e+02],
       ...,
       [3.00000000e+00, 6.54000000e+00, 3.13516656e+08, 7.33750000e+07,
        2.35395000e+03],
       [0.00000000e+00, 6.60000000e+00, 1.03735000e+05, 3.00000000e+06,
        3.60000000e+01],
       [1.00000000e+00, 6.17000000e+00, 6.22507540e+07, 3.26950000e+07,
        6.54800000e+02]])

## Select features

### Predict budget

In [104]:
X_train_budget, X_test_budget, y_train_budget, y_test_budget = train_test_split(X_budget,y_budget,test_size=0.2,random_state=15)
budget_lr = LinearRegression()
fit_budget_lr = budget_lr.fit(X_train_budget,y_train_budget)
y_pred_budget = fit_budget_lr.predict(X_test_budget)
print("Normalized MAE budget: "+str(nmae(y_test_budget,y_pred_budget,"range")))

Normalized MAE budget: 0.27181668044926244


### Predict budget using communities

In [105]:
X_train_budget_com, X_test_budget_com, y_train_budget_com, y_test_budget_com = train_test_split(X_budget_com,y_budget_com,test_size=0.2,random_state=15)
budget_com_lr = LinearRegression()
fit_budget_com_lr = budget_com_lr.fit(X_train_budget_com,y_train_budget_com)
y_pred_budget_com = fit_budget_com_lr.predict(X_test_budget_com)
print("Normalized MAE budget: "+str(nmae(y_test_budget_com,y_pred_budget_com,"range")))

Normalized MAE budget: 0.2530257843005383


In [100]:
X_train_budget_ohe, X_test_budget_ohe, y_train_budget_com, y_test_budget_com = train_test_split(X_budget_ohe,y_budget_com,test_size=0.2,random_state=15)
budget_com_lr = LinearRegression()
fit_budget_com_lr = budget_com_lr.fit(X_train_budget_ohe,y_train_budget_com)
y_pred_budget_com = fit_budget_com_lr.predict(X_test_budget_ohe)
print("Normalized MAE budget: "+str(nmae(y_test_budget_com,y_pred_budget_com,"range")))

Normalized MAE budget: 0.2757462996882117


### Predict budget

In [None]:
y_pred = fit_model_lr.predict(X_test)

In [None]:
print("Normalized MAE budget: "+str(nmae(y_test,y_pred)))

In [None]:
plt.scatter(y_test,y_pred)
plt.plot( [0,np.max(y_test)],[0,np.max(y_test)] ,c = 'r')

In [None]:
print(type(X_test))

### Predict popularity 

In [None]:
popularity_signal = actors_signals_df[["popularity"]].values

In [None]:
popularity_features = actors_signals_df[["budget","revenue","vote_average","vote_count"]].values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(popularity_features,popularity_signal,test_size=0.2)

In [None]:
fit_model_lr = model_lr.fit(X_train,y_train)

In [None]:
y_pred = fit_model_lr.predict(X_test)

In [None]:
print("Normalized MAE popularity: "+str(nmae(y_test,y_pred)))

In [None]:
plt.scatter(y_test,y_pred)
plt.plot( [0,np.max(y_test)],[0,np.max(y_test)] ,c = 'r')

### Predict revenue

In [None]:
revenue_signal = actors_signals_df[["revenue"]].values

In [None]:
revenue_features = actors_signals_df[["budget","popularity","vote_average","vote_count"]].values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(revenue_features,revenue_signal,test_size=0.2)

In [None]:
fit_model_lr = model_lr.fit(X_train,y_train)

In [None]:
y_pred = fit_model_lr.predict(X_test)

In [None]:
print("Normalized MAE revenue: "+str(nmae(y_test,y_pred)))

In [None]:
plt.scatter(y_test,y_pred)
plt.plot( [0,np.max(y_test)],[0,np.max(y_test)] ,c = 'r')