In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# Feature scaler instance; it is not referenced by any of the cells visible in this chunk.
sc = StandardScaler() # Maybe we want to bin continuous data like budget

In [3]:
# Load the processed TMDB spreadsheet and discard the 'title', 'revenue' and
# 'budget' columns in one chain. The prediction target used further down is
# 'log_revenue', so the plain 'revenue' column is dropped here.
df = (
    pd.read_excel('../Data/ProcessedData/TMDB_processed.xlsx')
    .drop(columns=['title', 'revenue', 'budget'])
)

In [4]:
# Predictor columns: every column of df except the 'log_revenue' target.
# NOTE(review): this keeps identifier-like columns such as 'id' (it appears
# among the selected features in the recorded output) - confirm that is intended.
features = df.columns[df.columns != 'log_revenue']

In [5]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df['log_revenue'],
    test_size=0.1,
    random_state=42,
)

# Gradient-boosted regressor used as the base estimator for feature elimination.
cv_estimator = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=300,
    max_depth=4,
    min_child_weight=5,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.6,
)
# NOTE(review): RFECV appears to refit clones of the estimator, so this fit
# should not influence the selection below. It is kept so that cv_estimator
# itself stays fitted in case later cells rely on it - confirm.
cv_estimator.fit(X_train, y_train)

# Recursive feature elimination with 5-fold cross-validation, removing one
# feature per step and scoring each candidate subset by R^2.
# fit() returns the selector itself, so construction and fitting are chained.
cv_selector = RFECV(cv_estimator, step=1, cv=5, scoring='r2').fit(X_train, y_train)

# Boolean mask aligned with X_train.columns: True marks a retained feature.
rfecv_mask = cv_selector.get_support()
rfecv_features = [
    column for column, keep in zip(X_train.columns, rfecv_mask) if keep
]

print('Optimal # of Features:', cv_selector.n_features_)
print('Best Features:', rfecv_features)



Optimal # of Features: 551
Best Features: ['release_date', 'popularity', 'id', 'runtime', 'cast_average', 'crew_average', 'Jr. ', 'Frank Welker ', 'Samuel L. Jackson ', 'Liam Neeson ', 'Robert De Niro ', 'Bruce Willis ', 'Nicolas Cage ', 'Morgan Freeman ', 'Willem Dafoe ', 'John Goodman ', 'Steve Buscemi ', 'Matt Damon ', 'Sylvester Stallone ', 'Bess Flowers ', 'Stanley Tucci ', 'Johnny Depp ', 'Richard Jenkins ', 'J.K. Simmons ', 'Alec Baldwin ', 'Julianne Moore ', 'Keith David ', 'Dennis Quaid ', 'Robert Downey Jr. ', 'Harrison Ford ', 'Tom Hanks ', 'Bill Murray ', 'Christopher Walken ', 'Brad Pitt ', 'Thomas Rosales Jr. ', 'John Hurt ', 'John Leguizamo ', 'Paul Giamatti ', 'Robert Duvall ', 'Susan Sarandon ', 'Nicole Kidman ', 'Michael Caine ', 'Ben Stiller ', 'Robin Williams ', 'Gene Hackman ', 'Denzel Washington ', 'Alfred Molina ', 'Ben Kingsley ', 'Mickie McGowan ', 'Woody Harrelson ', 'Dustin Hoffman ', 'Meryl Streep ', 'Brian Cox ', 'Ewan McGregor ', 'Harry Dean Stanton ', 'Fo