In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [2]:
# Load the object stored in the local pickle file into `df`
# (treated as a DataFrame by the cells that follow).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("final_df.pkl")

In [3]:
# Remove identifier/date fields, the raw analyst rating counts and the other
# columns listed here so that they are not passed to the model as features.
df.drop(
    columns=[
        'ticker',
        'date',
        'calendardate',
        'name',
        'rating_cnt_strong_buys',
        'rating_cnt_mod_buys',
        'rating_cnt_holds',
        'rating_cnt_mod_sells',
        'rating_cnt_strong_sells',
        'rating_cnt_with',
        'rating_cnt_without',
        'rating_change',
        'quart',
        'year',
        'industry',
    ],
    inplace=True,
)

In [4]:
# Categorical columns that get one-hot encoded in the next cell.
one_hot_features = ["exchange", "sector"]

In [5]:
# One-hot encode each categorical feature (dropping the first level) and
# append the resulting indicator columns to `df`, aligned on the row index.
# NOTE(review): `sparse=` and `get_feature_names` are spelled differently in
# newer scikit-learn releases - confirm against the installed version.
for feature in one_hot_features:
    feature_frame = df[[feature]]
    ohe = OneHotEncoder(drop='first', sparse=False)
    encoded_values = ohe.fit(feature_frame).transform(feature_frame)
    indicator_df = pd.DataFrame(
        encoded_values,
        columns=ohe.get_feature_names([feature]),
        index=feature_frame.index,
    )
    df = df.join(indicator_df)

In [6]:
# The original categorical columns are no longer needed once their
# one-hot indicator columns have been joined onto the frame.
df.drop(columns=['exchange', 'sector'], inplace=True)

In [7]:
# Work on an independent (deep) copy so `df` keeps the target column.
df_X = df.copy(deep=True)

In [8]:
# Strip the regression target out of the feature frame.
df_X.drop(columns="rating_mean_recom", inplace=True)

In [9]:
# X: feature frame; y: the "rating_mean_recom" column taken from the full frame.
X, y = df_X, df["rating_mean_recom"]

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

### Simple Model

In [None]:
# Baseline random forest trained with the absolute-error split criterion.
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model and scores.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' - confirm against the installed version.
rf = RandomForestRegressor(criterion='mae', random_state=15)
rf.fit(X_train, y_train)

In [None]:
# Score of the baseline model on the held-out test split.
baseline_test_score = rf.score(X_test, y_test)
print(baseline_test_score)

In [None]:
# Predict on the held-out features (the split above defines X_test; there is no X_te).
test_set_pred = rf.predict(X_test)

In [None]:
# Mean absolute error on the test set, computed inline because the `mae`
# helper is only defined further down and would not exist yet on a
# top-to-bottom run.
np.mean(np.abs(test_set_pred - y_test))

In [None]:
# Persist the fitted baseline model; the context manager guarantees the file
# is flushed and closed even if pickling fails.
filename = 'randomforest.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

### Randomized hyperparameter search (RandomizedSearchCV)

In [11]:
# Candidate values for each hyperparameter explored by the randomized search.

# Forest size: 10 evenly spaced tree counts from 200 to 2000.
n_estimators = list(map(int, np.linspace(200, 2000, num=10)))
# How many features are considered at every split.
max_features = ['auto', 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None.
max_depth = list(map(int, np.linspace(10, 110, num=11))) + [None]
# Minimum number of samples needed to split an internal node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space (insertion order matches the printed output).
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [12]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune. Its random_state pins the forest's own randomness;
# the random_state passed to RandomizedSearchCV below is a separate seed.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' - confirm against the installed version.
rf = RandomForestRegressor(criterion='mae', random_state=42)
# Randomized search: 100 sampled combinations x 3-fold cross validation
# (300 fits in total), run on all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Display the hyperparameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
# Best cross-validation score found by the search, then the score of the
# tuned model on the held-out test split.
best_cv_score = rf_random.best_score_
print(best_cv_score)
tuned_test_score = rf_random.score(X_test, y_test)
print(tuned_test_score)

In [None]:
# Predict on the held-out features (the split above defines X_test; there is no X_te).
test_set_pred = rf_random.predict(X_test)

In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between predictions and targets.

    Computes mean(|y_pred - y_true|) with NumPy, so both arguments must
    support element-wise subtraction (for example arrays or Series).
    """
    abs_errors = np.abs(y_pred - y_true)
    return np.mean(abs_errors)

In [None]:
# Mean absolute error of the tuned model's predictions on the test split.
mae(y_true=y_test, y_pred=test_set_pred)

In [None]:
# Persist the fitted search object; the context manager guarantees the file
# is flushed and closed even if pickling fails.
filename = 'randomforest_randomcv.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_random, model_file)