In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

In [128]:
def obtain_models_by_hyperopt_file(filename, model=None, score_col='score', k=1):
    df = pd.read_csv(filename)
    
    scores = []
    models = []

    for row in rf_best_df.sort_values( by=score_col ).head( k ).iterrows():
        score, max_depth, max_features, min_samples_leaf = row[1][['r2score', 'max_depth',
                                                                   'max_features', 'min_samples_leaf']]
        
        model_obj = model(max_depth=int(max_depth),
                          max_features=max_features,
                          min_samples_leaf=int(min_samples_leaf))
        
        scores.append(score)
        models.append(model_obj)
    
    return(zip(scores, models))

In [129]:
rf_models = obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-rf.csv",
                                           model=RandomForestRegressor,
                                           score_col='r2score', k=5)

et_models = obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-et.csv",
                                           model=ExtraTreesRegressor,
                                           score_col='r2score', k=5)

dt_models = obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-ada.csv",
                                           model=DecisionTreeRegressor,
                                           score_col='r2score', k=5)

In [130]:
ada_models = []

for score, model_obj in dt_models:
    ada = AdaBoostRegressor(base_estimator=model_obj, n_estimators=100)
    
    ada_models.append((score, ada))

In [131]:
for rf in rf_models:
    rf[1].set_params(n_estimators=100)

In [132]:
for et in et_models:
    et[1].set_params(n_estimators=100)

In [133]:
models = rf_models + et_models + ada_models

In [134]:
df = pd.read_csv('data/manipulated_train.csv')
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

In [135]:
for model in models:
    model[1].fit(X, y)

In [137]:
df = pd.read_csv('data/manipulated_test.csv')
X_unseen = df

In [143]:
models[0][1].predict(X_unseen)

array([ 11.70884292,  11.94646479,  12.09193467, ...,  11.92129225,
        11.64692303,  12.35590864])

In [147]:
(1 - models[0][0]) * models[0][1].predict(X_unseen)

array([ 10.43390544,  10.64565344,  10.77528358, ...,  10.62322185,
        10.3787278 ,  11.01051429])

In [148]:
preds = None
total_score = 0

for model in models:
    if preds is None:
        preds = (1 - model[0]) * model[1].predict(X_unseen)
    else:
        preds += (1 - model[0]) * model[1].predict(X_unseen)
        
    total_score += 1 - model[0]

In [151]:
preds / total_score

array([ 11.72584038,  11.93108549,  12.08993841, ...,  11.92977201,
        11.63445347,  12.348768  ])