In [1]:
import pandas as pd
import datetime
import time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

class MyTimer():
    def __init__(self):
        self.start = time.time()
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        runtime = end - self.start
        msg = 'The function took {time} seconds to complete'
        print(msg.format(time=runtime))


In [2]:
X = pd.read_csv('train.csv')

y = X['y'].values
X.drop(['ID', 'y'], axis=1, inplace=True)
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=None, shuffle=True)

model = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25, criterion='mse',
                            min_samples_leaf=35, max_features=200)

print("Fitting Model with Train data and default parameters:")
with MyTimer():                            
    model.fit(X_train, y_train)


Fitting Model with Train data and default parameters:
The function took 3.785903215408325 seconds to complete


In [3]:
print(model.score(X_train, y_train))

0.6027001702154409


In [4]:
model.get_params

<bound method BaseEstimator.get_params of ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features=200, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=35,
                    min_samples_split=25, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=4, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)>

<pre>
possible list of parameters: 
model.get_params
lt& bound method BaseEstimator.get_params of ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features=200, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=35,
                    min_samples_split=25, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=4, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)>'''

In [5]:
gsc = GridSearchCV(
    estimator=model,
    param_grid={
        #'n_estimators': range(50,126,25),
        'max_features': range(50,401,50),
        #'max_features': [50,100], # can be list or range or other
        #'criterion':['mse','mae']
        #min_samples_leaf': range(20,50,5),
        #min_samples_split': range(15,36,5),
    },
    scoring='r2',
    #scoring='neg_mean_squared_error', # or look here for other choices 
    # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=5
)

print("Running GridSearchCV:")
with MyTimer():    
    grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#for test_mean, train_mean, param in zip(
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        #grid_result.cv_results_['mean_train_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))
    
model = ExtraTreesRegressor(**grid_result.best_params_)

print("Fitting Model with GridSearch and Cross Validation:")
model.fit(X_train, y_train)

print(model.score(X_train, y_train))

#df_sub = pd.DataFrame({'ID': id_test, 'y': model.predict(test)})
#df_sub.to_csv('mercedes-submission.csv', index=False)

Running GridSearchCV:
The function took 131.70337557792664 seconds to complete
Best: 0.565444 using {'max_features': 150}
Test : 0.560649 with: {'max_features': 50}
Test : 0.564362 with: {'max_features': 100}
Test : 0.565444 with: {'max_features': 150}
Test : 0.564576 with: {'max_features': 200}
Test : 0.563906 with: {'max_features': 250}
Test : 0.562400 with: {'max_features': 300}
Test : 0.561442 with: {'max_features': 350}
Test : 0.559104 with: {'max_features': 400}
Fitting Model with GridSearch and Cross Validation:
0.9783994053044038


In [6]:
print("True Model Score:")
print(model.score(X_test, y_test))

True Model Score:
0.4346292637881207


In [7]:
grid_result

GridSearchCV(cv=5, error_score=nan,
             estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                           criterion='mse', max_depth=None,
                                           max_features=200,
                                           max_leaf_nodes=None,
                                           max_samples=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=35,
                                           min_samples_split=25,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100, n_jobs=4,
                                           oob_score=False, random_state=None,
                                           verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_g