In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import scipy.sparse as sparse
from tqdm import tqdm

# Read in the data

In [11]:
filtered_data = pd.read_csv("IMDb movies.csv")
# Only include columns we want
filtered_data = filtered_data[['imdb_title_id', 'genre', 'duration', 'director', 'writer', 'production_company', 'actors', 'avg_vote', 'votes', 'budget', 'worlwide_gross_income', 'metascore']]
# Drop columns with important missing info
filtered_data = filtered_data.dropna(subset=['imdb_title_id', 'genre', 'duration', 'director', 'writer', 'production_company', 'actors', 'avg_vote', 'votes', 'budget'])
filtered_data = filtered_data.set_index("imdb_title_id")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# To do the prediction, we only care about avg_vote, votes, and metascore
filtered_data = filtered_data[["avg_vote","votes","metascore"]]
filtered_data

Unnamed: 0_level_0,avg_vote,votes,metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000574,6.1,589,
tt0002101,5.2,446,
tt0002445,6.2,273,
tt0002452,6.7,198,
tt0002461,5.5,225,
...,...,...,...
tt9890308,7.2,214,
tt9894394,7.9,440,
tt9900782,8.5,8400,
tt9905412,7.4,494,


# Separate the wheat and the tares (has metascore or not)

In [6]:
wheat = filtered_data.dropna(subset=["metascore"])
tares = filtered_data.loc[filtered_data["metascore"].isna(),:]

In [7]:
wheat

Unnamed: 0_level_0,avg_vote,votes,metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0006864,7.8,13875,99.0
tt0017136,8.3,156076,98.0
tt0018037,6.5,8866,66.0
tt0018773,8.1,27414,90.0
tt0019777,7.0,6900,69.0
...,...,...,...
tt9426210,7.6,16277,72.0
tt9482230,6.0,549,66.0
tt9611484,7.5,117,36.0
tt9626278,6.9,303,79.0


In [8]:
tares

Unnamed: 0_level_0,avg_vote,votes,metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000574,6.1,589,
tt0002101,5.2,446,
tt0002445,6.2,273,
tt0002452,6.7,198,
tt0002461,5.5,225,
...,...,...,...
tt9890308,7.2,214,
tt9894394,7.9,440,
tt9900782,8.5,8400,
tt9905412,7.4,494,


In [10]:
def forest_regress(df: pd.DataFrame, target_col: str):
    """
    Takes a dataframe and does linear regression on it.

    Parameters:
        df (pd.DataFrame): The dataframe to use
        target_col (str): The name of the column to predict

    Returns:
        Something (the coefficients? The model itself?  Idk)
    """
    # Split into X and y
    Y = df[target_col] # Get the target column
    # X = sparse.csr_matrix(df.drop(columns=[target_col]).values) # Convert the dataframe to a sparse matrix
    X = df.drop(columns=[target_col]).values

    # Train-test split (70-30)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)

    # Make the model
    forest = RandomForestRegressor(min_samples_split=190, n_jobs=-1)
    # grid = GridSearchCV(forest, param_grid={"min_samples_split":range(2,200,2)}, n_jobs=-1)
    
    # Fit the model
    # grid.fit(X, Y)
    forest.fit(X_train, Y_train)

    # print(grid.best_score_)
    # print(grid.best_params_)
    print(forest.score(X_test, Y_test))
    
    # Return the forest
    return forest

In [15]:
forest = forest_regress(wheat, target_col="metascore")

metascore_prediction = forest.predict(tares.drop(columns="metascore").values)

0.5820702461683883


In [19]:
tares.loc[:,'filled_metascore'] = metascore_prediction
tares

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0_level_0,avg_vote,votes,metascore,filled_metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0000574,6.1,589,,53.543208
tt0002101,5.2,446,,38.817058
tt0002445,6.2,273,,54.600668
tt0002452,6.7,198,,58.278441
tt0002461,5.5,225,,45.374474
...,...,...,...,...
tt9890308,7.2,214,,61.279039
tt9894394,7.9,440,,78.674199
tt9900782,8.5,8400,,81.182528
tt9905412,7.4,494,,63.427847


In [21]:
wheat['filled_metascore'] = wheat["metascore"]
wheat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wheat['filled_metascore'] = wheat["metascore"]


Unnamed: 0_level_0,avg_vote,votes,metascore,filled_metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0006864,7.8,13875,99.0,99.0
tt0017136,8.3,156076,98.0,98.0
tt0018037,6.5,8866,66.0,66.0
tt0018773,8.1,27414,90.0,90.0
tt0019777,7.0,6900,69.0,69.0
...,...,...,...,...
tt9426210,7.6,16277,72.0,72.0
tt9482230,6.0,549,66.0,66.0
tt9611484,7.5,117,36.0,36.0
tt9626278,6.9,303,79.0,79.0


In [24]:
filled_metascores = pd.concat((wheat, tares))
filled_metascores

Unnamed: 0_level_0,avg_vote,votes,metascore,filled_metascore
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0006864,7.8,13875,99.0,99.000000
tt0017136,8.3,156076,98.0,98.000000
tt0018037,6.5,8866,66.0,66.000000
tt0018773,8.1,27414,90.0,90.000000
tt0019777,7.0,6900,69.0,69.000000
...,...,...,...,...
tt9890308,7.2,214,,61.279039
tt9894394,7.9,440,,78.674199
tt9900782,8.5,8400,,81.182528
tt9905412,7.4,494,,63.427847


In [26]:
filled_metascores[["filled_metascore"]].to_csv("filled_metascores_forest.csv")