In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import scipy.sparse as sparse
from tqdm import tqdm

# Load in the data and make the dataframes that we will regress over

In [2]:
# Read in the clean imdb data with one-hot encoded columns.
# The CSV is generated by Movie_Program.py.
# NOTE: this read is slow (the original author measured 5m 14s on their machine).
imdb_clean = pd.read_csv(
    "Five_Actors.csv",
    index_col="imdb_title_id",
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Adjusted budget / income frames (both generated by regularize_currency.ipynb)
adjusted_budget = pd.read_csv(
    "adjusted_budget.csv",
    index_col="imdb_title_id",
)
adjusted_income = pd.read_csv(
    "adjusted_income.csv",
    index_col="imdb_title_id",
)

# Frame holding the filled metascores (generated by predict_meta_score.ipynb)
filled_metascores = pd.read_csv(
    "filled_metascores.csv",
    index_col="imdb_title_id",
)

# Base frame shared by every regression below: inner-join the adjusted budget
# and discard the unadjusted money columns.
# NOTE(review): "worlwide_gross_income" has to match the column header in the
# CSV exactly -- do not "correct" the spelling without checking the data.
imdb = (
    imdb_clean
    .join(adjusted_budget, how="inner")
    .drop(columns=["budget", "worlwide_gross_income"])
)

# Target: the filled_metascore column (the original metascore is removed)
imdb_metascore = (
    imdb
    .join(filled_metascores, how="inner")
    .drop(columns=["metascore"])
)

# Target: the adjusted_income column (metascore is removed from the features)
imdb_gross_income = (
    imdb
    .join(adjusted_income, how="inner")
    .drop(columns=["metascore"])
)

# Target: metascore, restricted to rows where the original metascore exists
imdb_orig_metascore = imdb.dropna(subset=["metascore"])


In [4]:
# Inspect the frame that will be used for the income regression
imdb_gross_income

Unnamed: 0_level_0,duration,avg_vote,votes,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,...,actors_Éric Caravaca,actors_Éric Naggar,actors_Éva Kerekes,actors_Ólafur Darri Ólafsson,actors_Óscar Jaenada,actors_Özkan Ugur,actors_Þorsteinn Bachmann,actors_Þröstur Leó Gunnarsson,adjusted_budget,adjusted_gross_income
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,118,6.4,77852,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,7.014640e+07,1.110930e+08
tt0042192,138,8.2,117634,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.503466e+07,1.615561e+06
tt0042208,112,7.9,23441,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1.323050e+07,3.207538e+05
tt0042332,74,7.3,142164,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3.114323e+07,1.021568e+09
tt0042464,88,7.0,3395,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.718247e+07,7.209121e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9877340,134,5.3,173,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.187816e+05,4.262343e+04
tt9878242,130,6.0,202,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.312689e+05,1.072389e+05
tt9886872,130,8.1,990,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,4.312689e+05,2.599681e+03
tt9900782,145,8.5,8400,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.450151e+06,5.305263e+05


In [36]:
# Remove the vote-based columns from the income-regression features.
# This cell rebinds imdb_gross_income to its own result, so a second run would
# hit a KeyError on the already-removed columns; errors="ignore" makes the
# cell safe to re-run.
imdb_gross_income = imdb_gross_income.drop(columns=["avg_vote","votes"], errors="ignore")
imdb_gross_income

Unnamed: 0_level_0,duration,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,actors_Éric Caravaca,actors_Éric Naggar,actors_Éva Kerekes,actors_Ólafur Darri Ólafsson,actors_Óscar Jaenada,actors_Özkan Ugur,actors_Þorsteinn Bachmann,actors_Þröstur Leó Gunnarsson,adjusted_budget,adjusted_gross_income
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,118,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,7.014640e+07,1.110930e+08
tt0042192,138,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1.503466e+07,1.615561e+06
tt0042208,112,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1.323050e+07,3.207538e+05
tt0042332,74,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3.114323e+07,1.021568e+09
tt0042464,88,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1.718247e+07,7.209121e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9877340,134,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,7.187816e+05,4.262343e+04
tt9878242,130,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,4.312689e+05,1.072389e+05
tt9886872,130,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.312689e+05,2.599681e+03
tt9900782,145,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.450151e+06,5.305263e+05


In [22]:
# Inspect the rows that have an original (non-missing) metascore
imdb_orig_metascore

Unnamed: 0_level_0,duration,avg_vote,votes,metascore,Action,Adventure,Animation,Biography,Comedy,Crime,...,actors_Éric Bougnon,actors_Éric Caravaca,actors_Éric Naggar,actors_Éva Kerekes,actors_Ólafur Darri Ólafsson,actors_Óscar Jaenada,actors_Özkan Ugur,actors_Þorsteinn Bachmann,actors_Þröstur Leó Gunnarsson,adjusted_budget
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,118,6.4,77852,44.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,7.014640e+07
tt0042192,138,8.2,117634,98.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.503466e+07
tt0042200,107,6.9,4299,77.0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,4.047315e+07
tt0042332,74,7.3,142164,85.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.114323e+07
tt0042530,87,7.6,11210,74.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.295618e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9426210,112,7.6,16277,72.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1.123694e+07
tt9482230,115,6.0,549,66.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.399857e+06
tt9611484,127,7.5,117,36.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.153401e+05
tt9626278,94,6.9,303,79.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.617200e+04


# Perform Regression

In [168]:
def linear_regress(df: pd.DataFrame, target_col: str, test_size: float = .3, random_state=None) -> None:
    """
    Fit an ordinary least squares model on a random train/test split of a
    dataframe and print its R^2 score on the held-out rows.

    Parameters:
        df (pd.DataFrame): The dataframe to use. Every column other than
            target_col is used as a feature.
        target_col (str): The name of the column to predict
        test_size (float): Fraction of the rows held out for scoring.
            Defaults to .3 (a 70-30 split).
        random_state (int or None): Seed forwarded to train_test_split.
            The default None draws a different split on every call; pass an
            int to make the printed score reproducible.

    Returns:
        None. The test-set R^2 is printed, not returned.

    Raises:
        KeyError: If target_col is not a column of df.
    """
    # Split into X and y
    Y = df[target_col] # Get the target column
    # .values builds a dense array first; the result is then stored as CSR
    X = sparse.csr_matrix(df.drop(columns=[target_col]).values)

    # Train-test split (70-30 unless test_size is overridden)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Make the model; fit_intercept=True has the model learn a bias term
    ols = LinearRegression(fit_intercept=True)

    # Fit the model
    ols.fit(X_train, Y_train)

    # R^2 on the rows the model has not seen
    print(ols.score(X_test, Y_test))

In [41]:
# Ordinary least squares on the income frame; prints the test-set score
linear_regress(imdb_gross_income, "adjusted_gross_income")

0.45321242750977364


In [161]:
def tree_regress(df: pd.DataFrame, target_col: str, test_size: float = .3, random_state=None) -> None:
    """
    Fit a decision tree regressor on a random train/test split of a dataframe
    and print its R^2 score on the held-out rows.

    Parameters:
        df (pd.DataFrame): The dataframe to use. Every column other than
            target_col is used as a feature.
        target_col (str): The name of the column to predict
        test_size (float): Fraction of the rows held out for scoring.
            Defaults to .3 (a 70-30 split).
        random_state (int or None): Seed forwarded to both train_test_split
            and the tree (whose splitter="random" is itself stochastic).
            The default None gives a different result on every call; pass an
            int to make the printed score reproducible.

    Returns:
        None. The test-set R^2 is printed, not returned.

    Raises:
        KeyError: If target_col is not a column of df.
    """
    # Split into X and y
    Y = df[target_col] # Get the target column
    # .values builds a dense array first; the result is then stored as CSR
    X = sparse.csr_matrix(df.drop(columns=[target_col]).values)

    # Train-test split (70-30 unless test_size is overridden)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Make the model
    tree = DecisionTreeRegressor(
        min_samples_split=150,
        splitter="random",
        random_state=random_state,
    )

    # Fit the model
    tree.fit(X_train, Y_train)

    # R^2 on the rows the model has not seen
    print(tree.score(X_test, Y_test))
    

In [158]:
# Decision tree on the rows with an original metascore, using every other column as a feature
tree_regress(imdb_orig_metascore, "metascore")

0.5354774268184315
{'min_samples_split': 170, 'splitter': 'random'}


In [169]:
# Decision tree with only avg_vote and votes as features; rows with a missing value in any of the three columns are dropped
tree_regress(imdb[["avg_vote","votes","metascore"]].dropna(), "metascore")

0.5477957147732591


In [4]:
def forest_regress(df: pd.DataFrame, target_col: str, test_size: float = .3, random_state=None) -> None:
    """
    Fit a random forest regressor on a random train/test split of a dataframe
    and print its R^2 score on the held-out rows.

    Parameters:
        df (pd.DataFrame): The dataframe to use. Every column other than
            target_col is used as a feature.
        target_col (str): The name of the column to predict
        test_size (float): Fraction of the rows held out for scoring.
            Defaults to .3 (a 70-30 split).
        random_state (int or None): Seed forwarded to both train_test_split
            and the forest. The default None gives a different result on
            every call; pass an int to make the printed score reproducible.

    Returns:
        None. The test-set R^2 is printed, not returned.

    Raises:
        KeyError: If target_col is not a column of df.
    """
    # Split into X and y
    Y = df[target_col] # Get the target column
    # .values builds a dense array first; the result is then stored as CSR
    X = sparse.csr_matrix(df.drop(columns=[target_col]).values)

    # Train-test split (70-30 unless test_size is overridden)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Make the model (n_jobs=-1 uses every available core).
    # NOTE(review): min_samples_split=190 presumably came from an earlier
    # GridSearchCV over range(2, 200, 2) -- confirm before changing it.
    forest = RandomForestRegressor(
        min_samples_split=190,
        n_jobs=-1,
        random_state=random_state,
    )

    # Fit the model
    forest.fit(X_train, Y_train)

    # R^2 on the rows the model has not seen
    print(forest.score(X_test, Y_test))

In [5]:
# Build the three-column frame once; it is identical on every iteration.
votes_metascore_frame = imdb[["avg_vote","votes","metascore"]].dropna()

# Fit and score 10 times; each call draws its own random train/test split.
for _ in range(10):
    forest_regress(votes_metascore_frame, "metascore")

0.5378308411646783
{'min_samples_split': 180}
0.5376671929053158
{'min_samples_split': 196}
0.5376229125330316
{'min_samples_split': 186}
0.5377156888798211
{'min_samples_split': 190}
0.5378846154824686
{'min_samples_split': 198}
0.5376670450101199
{'min_samples_split': 194}
0.5379623888831911
{'min_samples_split': 192}
0.5378177453258159
{'min_samples_split': 198}
0.5376237061226458
{'min_samples_split': 188}
0.5378555745348119
{'min_samples_split': 190}


In [7]:
# Random forest on the income frame, using every other column as a feature
forest_regress(imdb_gross_income, "adjusted_gross_income")