In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
import pandas as pd

In [2]:
# Load the modelling table; the path is relative to the notebook's working directory.
reduced = pd.read_csv("../data/reduced_noot.csv")

In [3]:
# Discard the first column of the loaded table.
# NOTE(review): presumably a leftover index column from the CSV export -- confirm.
# This cell is not idempotent: every re-run removes one more column.
reduced = reduced.drop(reduced.columns[0], axis=1)

In [4]:
# reduced.scores = reduced.scores ** .5
# reduced.tmhalfsc = reduced.tmhalfsc ** .5
# reduced.predscores = reduced.predscores ** .5

In [5]:
# The target is the "scores" column; every remaining column is a feature.
base_y = reduced["scores"]
base_X = reduced.drop("scores", axis=1)
# Alternative hand-picked feature subset, kept for reference:
# base_X = reduced[['predscores','tmhalfsc','opphalfsc','airya','yaca','psd']]

In [6]:
def rmse(pred, y):
    """Return the root-mean-squared error between predictions and targets.

    Both inputs are converted to flat float arrays before subtracting, so the
    comparison is strictly positional: plain lists are accepted, and two
    pandas objects with different indexes are not re-aligned by label (which
    would silently turn the result into NaN).

    Parameters
    ----------
    pred : array-like of shape (n,)
        Predicted values.
    y : array-like of shape (n,)
        Observed values, in the same order as ``pred``.

    Returns
    -------
    float
        ``sqrt(mean((pred - y) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.
    """
    pred_arr = np.asarray(pred, dtype=float).ravel()
    y_arr = np.asarray(y, dtype=float).ravel()
    if pred_arr.size != y_arr.size:
        raise ValueError(
            f"pred and y must have the same length, got {pred_arr.size} and {y_arr.size}"
        )
    if y_arr.size == 0:
        raise ValueError("rmse is undefined for empty input")
    # Vectorised mean instead of the Python-level builtin sum().
    return sqrt(float(np.mean((pred_arr - y_arr) ** 2)))

In [7]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Hyper-parameter search for the random forest: 4 leaf sizes x 3 forest sizes,
# each scored by 8-fold cross-validated R^2.
# Float values of min_samples_leaf are interpreted by scikit-learn as a
# fraction of the training rows.
grid_params = dict(min_samples_leaf=np.linspace(.005,.015,num=4), n_estimators=range(90,101,5))
gs = GridSearchCV(
    # A fixed seed makes every candidate see the same source of randomness, so
    # score differences reflect the parameters and a re-run gives the same winner.
    estimator=RandomForestRegressor(random_state=0),
    param_grid=grid_params,
    cv=8,
    scoring='r2',
    # The recorded run of this cell ended in a KeyboardInterrupt; spread the
    # independent fits over all available cores.
    n_jobs=-1,
)
gs.fit(base_X, base_y)
gs.best_params_

KeyboardInterrupt: 

In [None]:
# Leaf-size search for a single decision tree, scored by 10-fold
# cross-validated negative MSE.
# Note: this rebinds both `grid_params` and `gs`, replacing the random-forest
# search defined in the previous cell.
grid_params = {"min_samples_leaf": np.linspace(0.01, 0.1, num=10)}
gs = GridSearchCV(
    DecisionTreeRegressor(),
    grid_params,
    scoring="neg_mean_squared_error",
    cv=10,
)
gs.fit(base_X, base_y)
gs.best_params_

In [None]:
def fit_Random_Cactus(X, y, best_params, random_state=None):
    """Fit a random forest, then replace every leaf's constant with a Lasso model.

    The data is split 75/25. A ``RandomForestRegressor`` is fitted on the
    training part and its test RMSE / R^2 are printed as the baseline. Then,
    for every tree and every leaf that training rows fall into, a separate
    ``Lasso`` regression is fitted on those rows. A test row is predicted by
    the Lasso model of the leaf it reaches in each tree, averaged over all
    trees; the RMSE / R^2 of that ensemble are printed as "Random Cactus".

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    y : pandas.Series or 1-D array
        Target values aligned with the rows of ``X``.
    best_params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``.
    random_state : int or None, default None
        Seed for the train/test split and, unless ``best_params`` already
        contains ``random_state``, for the forest. ``None`` keeps the
        previous non-deterministic behaviour.

    Returns
    -------
    None
        Results are printed only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state
    )

    # An explicit random_state inside best_params wins over the argument.
    forest_params = {"random_state": random_state, **best_params}
    base_rf = RandomForestRegressor(**forest_params)
    base_rf.fit(X_train, y_train)
    base_rf_pred = base_rf.predict(X_test)
    base_rf_RMSE = rmse(base_rf_pred, y_test)
    # The baseline here is the forest itself, so label it as such.
    print(f"Random Forest Regressor RMSE: {base_rf_RMSE:.3f}")
    print(f"Random Forest Regressor R^2: {base_rf.score(X_test, y_test):.3f}")

    # apply() yields one row per sample and one column per tree; iterating the
    # transpose therefore walks the forest tree by tree.
    train_leaves = base_rf.apply(X_train)
    regressors = []
    for tree_leaves in train_leaves.T:
        tree_models = {}
        for leaf in np.unique(tree_leaves):
            in_leaf = tree_leaves == leaf
            tree_models[leaf] = Lasso().fit(X_train[in_leaf], y_train[in_leaf])
        regressors.append(tree_models)

    # Predict leaf by leaf: one batched predict() call per (tree, leaf) instead
    # of building a one-row DataFrame for every (tree, test row) pair.
    test_leaves = base_rf.apply(X_test)
    per_tree_pred = np.empty(test_leaves.shape, dtype=float)
    for tree_idx, tree_leaves in enumerate(test_leaves.T):
        for leaf in np.unique(tree_leaves):
            in_leaf = tree_leaves == leaf
            per_tree_pred[in_leaf, tree_idx] = regressors[tree_idx][leaf].predict(X_test[in_leaf])

    # Ensemble prediction = plain average over the trees.
    y_pred = per_tree_pred.mean(axis=1)

    print(f"Random Cactus RMSE: {rmse(y_pred, y_test):.3f}")
    print(f"Random Cactus R^2: {r2_score(y_test, y_pred):.3f}")

In [None]:
def fit_Lasso_tree(X, y, best_params, random_state=None):
    """Fit a decision tree, then replace every leaf's constant with a Lasso model.

    The data is split 75/25. A ``DecisionTreeRegressor`` is fitted on the
    training part and its test RMSE is printed as the baseline. A separate
    ``Lasso`` regression is then fitted on the training rows of each leaf, and
    every test row is predicted by the model of the leaf it reaches. The
    RMSE / R^2 of those predictions are printed as "Lasso Tree".

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    y : pandas.Series or 1-D array
        Target values aligned with the rows of ``X``.
    best_params : dict
        Keyword arguments forwarded to ``DecisionTreeRegressor``.
    random_state : int or None, default None
        Seed for the train/test split and, unless ``best_params`` already
        contains ``random_state``, for the tree. ``None`` keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    None
        Results are printed only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state
    )

    # An explicit random_state inside best_params wins over the argument.
    tree_params = {"random_state": random_state, **best_params}
    base_dt = DecisionTreeRegressor(**tree_params)
    base_dt.fit(X_train, y_train)
    base_dt_pred = base_dt.predict(X_test)
    base_dt_RMSE = rmse(base_dt_pred, y_test)
    print(f"Decision Tree Regressor RMSE: {base_dt_RMSE:.3f}")

    # One Lasso model per leaf, trained on the training rows that end up there.
    train_leaves = base_dt.apply(X_train)
    regressors = {}
    for leaf in np.unique(train_leaves):
        in_leaf = train_leaves == leaf
        regressors[leaf] = Lasso().fit(X_train[in_leaf], y_train[in_leaf])

    # Predict leaf by leaf: one batched predict() call per leaf instead of
    # building a one-row DataFrame for every test row.
    test_leaves = base_dt.apply(X_test)
    y_pred = np.empty(len(test_leaves), dtype=float)
    for leaf in np.unique(test_leaves):
        in_leaf = test_leaves == leaf
        y_pred[in_leaf] = regressors[leaf].predict(X_test[in_leaf])

    print(f"Lasso Tree RMSE: {rmse(y_pred, y_test):.3f}")
    print(f"Lasso Tree R^2: {r2_score(y_test, y_pred):.3f}")

In [None]:
# Evaluate the Lasso-in-leaves tree twice: with a hand-set absolute leaf size,
# then with the parameters selected by the grid search stored in `gs`.
fit_Lasso_tree(base_X, base_y, {'min_samples_leaf':100})
fit_Lasso_tree(base_X, base_y, gs.best_params_)

In [None]:
# fit_Random_Cactus(base_X, base_y, {'min_samples_leaf':75, 'n_estimators':40})
# NOTE(review): when the notebook is run top to bottom, `gs` at this point is
# the DecisionTreeRegressor search (it was rebound after the random-forest
# search), so the forest is built from the tree's best parameters -- confirm
# this is intended.
fit_Random_Cactus(base_X, base_y, gs.best_params_)

In [None]:
# Linear baseline on the columns of base_X, scored by RMSE on a fresh 75/25 split.
X_train, X_test, y_train, y_test = train_test_split(base_X, base_y, test_size=0.25)
# NOTE(review): despite the variable name, this is an unregularised
# LinearRegression, not a Lasso model -- confirm which one was intended.
lasso = LinearRegression()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
rmse(y_pred, y_test)

In [None]:
# Inspect the coefficients of the linear model fitted in the previous cell.
lasso.coef_

In [None]:
# Dimensions of the held-out split.
X_test.shape

In [None]:
"""Hand Picked Variables from R:
predscores+tmhalfsc+patt+ypa+ratt+ypr+sackyds"""
manual = reduced[["scores", "predscores", "tmhalfsc", "patt", "ypa", "ratt", "ypr", "sackyds"]]

In [None]:
# Features of the hand-picked model: everything in `manual` except the target.
man_X = manual.drop("scores", axis=1)

In [None]:
# Target of the hand-picked model.
man_y = manual.loc[:, "scores"]

In [None]:
# Hold out 25% of the rows for evaluation. The seed is fixed because every
# model in the following cells is fitted and scored on this one split, so a
# re-run must reproduce the same rows for the numbers to stay comparable.
man_X_train, man_X_test, man_y_train, man_y_test = train_test_split(
    man_X, man_y, test_size=0.25, random_state=42
)

In [None]:
# Random forest on the hand-picked features; seeded so the reported metrics
# can be reproduced on a re-run.
man_rf = RandomForestRegressor(n_estimators=100, random_state=42)
man_rf.fit(man_X_train, man_y_train)

In [None]:
# Predictions of the hand-picked random forest on the held-out rows.
man_rf_pred = man_rf.predict(man_X_test)

In [None]:
# Held-out R^2 and RMSE of the hand-picked random forest.
man_rf_r2 = man_rf.score(man_X_test, man_y_test)
man_rf_RMSE = rmse(man_rf_pred, man_y_test)
print(
    f"Manual Random Forest Regressor RMSE: {man_rf_RMSE:.3f}",
    f"Manual Random Forest Regressor R^2: {man_rf_r2:.3f}",
    sep="\n",
)

In [None]:
# Single decision tree on the hand-picked features. scikit-learn permutes the
# features at every split, so ties between equally good splits are broken
# randomly; the seed makes the fitted tree reproducible.
man_dt = DecisionTreeRegressor(random_state=42)
man_dt.fit(man_X_train, man_y_train)

In [None]:
# Predictions of the hand-picked decision tree on the held-out rows.
man_dt_pred = man_dt.predict(man_X_test)

In [None]:
# Held-out R^2 and RMSE of the hand-picked decision tree.
man_dt_r2 = man_dt.score(man_X_test, man_y_test)
man_dt_RMSE = rmse(man_dt_pred, man_y_test)
print(
    f"Manual Decision Tree Regressor RMSE: {man_dt_RMSE:.3f}",
    f"Manual Decision Tree Regressor R^2: {man_dt_r2:.3f}",
    sep="\n",
)

## Grid Search for Manual Random Forest Regressor

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

In [None]:
# Base estimator for the search; seeded so that candidates differing only in
# n_estimators are compared on equal footing and a re-run gives the same winner.
rf = RandomForestRegressor(random_state=42)
# Only the forest size is varied (95..104); depth is pinned at 5.
# criterion="mse" and max_features="auto" are intentionally not listed: for a
# RandomForestRegressor they are the estimator defaults (squared error, all
# features), and current scikit-learn releases reject those two spellings.
search = {"n_estimators": list(range(95, 105)), "max_depth": [5]}

In [None]:
# 10-fold cross-validated grid search over `search`, using the estimator's default scorer.
rf_gs = GridSearchCV(estimator=rf, param_grid=search, cv=10)

In [None]:
# Run the search on the training split only; the test split is not used here.
rf_gs.fit(man_X_train, man_y_train)

In [None]:
# Print every selected hyper-parameter as "name  ->  value".
best_prms = rf_gs.best_params_
for name, value in best_prms.items():
    print(name, " -> ", value)

In [None]:
# Refit with the chosen configuration.
# NOTE(review): n_estimators=97 / max_depth=5 are hard-coded, presumably copied
# from an earlier grid-search run -- confirm they still match rf_gs.best_params_.
# criterion="mse" and max_features="auto" were dropped: they equal the
# RandomForestRegressor defaults and current scikit-learn rejects those
# spellings. The seed makes the reported metrics reproducible.
best_rf = RandomForestRegressor(max_depth=5, n_estimators=97, random_state=42)
best_rf.fit(man_X_train, man_y_train)

In [None]:
# Predictions of the tuned random forest on the held-out rows.
best_rf_pred = best_rf.predict(man_X_test)

In [None]:
# Held-out R^2 and RMSE of the tuned random forest.
best_rf_r2 = best_rf.score(man_X_test, man_y_test)
best_rf_RMSE = rmse(best_rf_pred, man_y_test)
print(
    f"Best Random Forest Regressor RMSE: {best_rf_RMSE:.3f}",
    f"Best Random Forest Regressor R^2: {best_rf_r2:.3f}",
    sep="\n",
)

## Transformation of tmhalfsc

In [None]:
reset -fs

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
import pandas as pd

In [None]:
# Reload the modelling table after the namespace reset and discard its first column.
# NOTE(review): this reads "reduced.csv", whereas the first section of the
# notebook reads "reduced_noot.csv" -- confirm the different file is intended.
reduced = pd.read_csv("../data/reduced.csv")
reduced = reduced.drop(reduced.columns[0], axis=1)

In [None]:
# Target is "scores"; all remaining columns are features.
y = reduced["scores"]
X = reduced.drop("scores", axis=1)

In [None]:
# Copy the tmhalfsc column into a plain NumPy array for the transformations below.
tmhalf = np.array(X["tmhalfsc"])

In [None]:
# Two candidate transformations of tmhalfsc.
sqrt_tmhalf = np.sqrt(tmhalf)
# Natural log of every non-zero entry; zeros are passed through unchanged
# because log(0) is undefined.
log_tmhalf = [np.log(value) if value != 0 else value for value in tmhalf]

In [None]:
# Swap the raw tmhalfsc column for its square-root version; the new column is
# appended as the last column of X.
X = X.drop(columns="tmhalfsc").assign(tmhalfsc=sqrt_tmhalf)
# To use the log transform instead:
# X["tmhalfsc"] = log_tmhalf

In [None]:
def rmse(pred, y):
    """Return the root-mean-squared error between predictions and targets.

    Inputs are flattened to float arrays before subtracting, which makes the
    comparison positional: plain lists are accepted and two pandas objects
    with different indexes are not aligned by label (that alignment would
    silently yield NaN).

    Parameters
    ----------
    pred : array-like of shape (n,)
        Predicted values.
    y : array-like of shape (n,)
        Observed values, in the same order as ``pred``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((pred - y) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.
    """
    pred_arr = np.asarray(pred, dtype=float).ravel()
    y_arr = np.asarray(y, dtype=float).ravel()
    if pred_arr.size != y_arr.size:
        raise ValueError(
            f"pred and y must have the same length, got {pred_arr.size} and {y_arr.size}"
        )
    if y_arr.size == 0:
        raise ValueError("rmse is undefined for empty input")
    # Vectorised mean instead of the Python-level builtin sum().
    return np.sqrt(np.mean((pred_arr - y_arr) ** 2))

In [None]:
# Repeat the 75/25 evaluation for 99 different seeds (1..99) to see how much
# the metrics depend on the particular split.
rmse_scores = []
r2_scores = []
rand_state = range(1,100)
for i in rand_state:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)
    # The forest is seeded with the same value as the split, so the
    # "Random State" reported later really reproduces its score.
    # criterion="mse" / max_features="auto" were dropped: they equal the
    # RandomForestRegressor defaults and current scikit-learn rejects those
    # spellings.
    rf = RandomForestRegressor(n_estimators=97, max_depth=5, random_state=i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    r2_scores.append(rf.score(X_test, y_test))
    rmse_scores.append(rmse(pred, y_test))

In [None]:
# One (seed, RMSE, R^2) tuple per evaluated split.
scores = list(zip(rand_state, rmse_scores, r2_scores))

In [None]:
# Pick the (seed, RMSE, R^2) tuple with the lowest RMSE; on ties the earliest
# seed wins, exactly as with a stable ascending sort.
best_state = min(scores, key=lambda entry: entry[1])
best_seed, best_rmse, best_r2 = best_state
print("Models sorted on RMSE ascending:")
print(f"Random State: {best_seed}")
print(f"RMSE: {best_rmse:.4f}")
print(f"R^2: {best_r2:.4f}")

In [None]:
# Central tendency of both metrics over all evaluated seeds.
print(
    f"Mean RMSE: {np.mean(rmse_scores)}",
    f"Median RMSE: {np.median(rmse_scores)}",
    f"Mean R^2: {np.mean(r2_scores)}",
    f"Median R^2: {np.median(r2_scores)}",
    sep="\n",
)

In [None]:
# Hand-picked feature subset, taken from the transformed feature table X.
man_y = y
man_X = X.loc[:, ["predscores", "tmhalfsc", "patt", "ypa", "ratt", "ypr", "sackyds"]]

In [None]:
# Same multi-seed evaluation as for the full feature set, restricted to the
# hand-picked features.
man_rmse_scores = []
man_r2_scores = []
for i in rand_state:
    man_X_train, man_X_test, man_y_train, man_y_test = train_test_split(man_X, man_y, test_size=0.25, random_state=i)
    # The forest is seeded with the same value as the split, so the
    # "Random State" reported later really reproduces its score.
    # criterion="mse" / max_features="auto" were dropped: they equal the
    # RandomForestRegressor defaults and current scikit-learn rejects those
    # spellings.
    man_rf = RandomForestRegressor(n_estimators=97, max_depth=5, random_state=i)
    man_rf.fit(man_X_train, man_y_train)
    man_pred = man_rf.predict(man_X_test)
    man_r2_scores.append(man_rf.score(man_X_test, man_y_test))
    man_rmse_scores.append(rmse(man_pred, man_y_test))

In [None]:
# One (seed, RMSE, R^2) tuple per evaluated split of the hand-picked model.
man_scores = list(zip(rand_state, man_rmse_scores, man_r2_scores))

In [None]:
# Pick the (seed, RMSE, R^2) tuple with the lowest RMSE; on ties the earliest
# seed wins, exactly as with a stable ascending sort.
best_state = min(man_scores, key=lambda entry: entry[1])
best_seed, best_rmse, best_r2 = best_state
print("Manual models sorted on RMSE ascending:")
print(f"Random State: {best_seed}")
print(f"Manual RMSE: {best_rmse:.4f}")
print(f"Manual R^2: {best_r2:.4f}")

In [None]:
# Central tendency of both metrics over all evaluated seeds (hand-picked model).
print(
    f"Manual mean RMSE: {np.mean(man_rmse_scores)}",
    f"Manual median RMSE: {np.median(man_rmse_scores)}",
    f"Manual mean R^2: {np.mean(man_r2_scores)}",
    f"Manual median R^2: {np.median(man_r2_scores)}",
    sep="\n",
)