## Does normalizing the features make a difference for a random forest (RF)?
This notebook is about settling an argument between me and Megan. Megan won this round.

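Here we run a head-to-head comparison of PowerTransformer and StandardScaler against no transformation, using the same random forest regressor and 4-fold cross-validation for each. The resulting scores are more or less identical.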

In [4]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler

# Absolute directory holding the simulation output; adjust for your own machine.
analysis_dir = "/home/isaac/Continuosity/MESS/analysis/parameter-estimation-cv/"
# Path of the simulation table inside that directory.
SIMOUT = analysis_dir + "/SIMOUT.txt"
# Tab-separated file whose first row carries the column names.
sim_df = pd.read_csv(SIMOUT, sep="\t", header=0)
# Preview the first two rows.
sim_df.head(2)

Unnamed: 0,S_m,J_m,birth_rate,death_proportion,trait_rate_meta,ecological_strength,generations,community_assembly_model,speciation_model,mutation_rate,...,SGD_0,SGD_1,SGD_2,SGD_3,SGD_4,SGD_5,SGD_6,SGD_7,SGD_8,SGD_9
0,100,750000,2.0,0.7,2.0,5.0,0.25,neutral,point_mutation,2.2e-08,...,8.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0
1,100,750000,2.0,0.7,2.0,5.0,0.25,neutral,point_mutation,2.2e-08,...,8.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [5]:

# One seed for every stochastic step, so that re-running this cell reproduces
# the scores and all three variants see the same forest randomness.
SEED = 42

# Drop simulations with any missing value before selecting columns.
sim_df = sim_df.dropna()
# Feature columns are those whose name contains "_h"; the two targets are
# predicted jointly (multi-output regression).
features = [x for x in sim_df.columns if "_h" in x]
targets = ["speciation_rate", "m"]
X = sim_df[features]
y = sim_df[targets]
# NOTE: the cross-validation below runs on the full X/y, not on this split.
# The split is kept (now seeded) so the names stay available to other cells.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=SEED)

rf = RandomForestRegressor(bootstrap=True,
                           max_depth=80,
                           min_samples_leaf=2,
                           min_samples_split=10,
                           n_estimators=1800,
                           random_state=SEED)

# Run the same forest behind each candidate transformation. Wrapping the
# transformer in a Pipeline means it is fit on the training folds only,
# so no information leaks from the held-out fold into the scaling.
for trans in ["pow", "std", "none"]:
    if trans == "pow":
        pt = PowerTransformer()
        pipe = Pipeline(steps=[('power', pt), ('rf', rf)])
    elif trans == "std":
        ss = StandardScaler()
        pipe = Pipeline(steps=[('std', ss), ('rf', rf)])
    else:
        # Baseline: the forest on the raw features.
        pipe = Pipeline(steps=[('rf', rf)])

    # One score per fold, using the estimator's default scorer.
    scores = cross_val_score(pipe, X, y, cv=4)
    print(trans, scores)

('pow', array([0.13722161, 0.14889039, 0.12477289, 0.17383785]))
('std', array([0.13945981, 0.14723101, 0.12624056, 0.17520341]))
('none', array([0.14054659, 0.14976201, 0.12567059, 0.17055688]))
