In [153]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [154]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

In [155]:
def obtain_models_by_hyperopt_file(filename, model=None, score_col='score', k=1):
    """Build unfitted estimators from the best rows of a hyperopt results CSV.

    The CSV is sorted in ascending order of ``score_col`` and the first ``k``
    rows are used, i.e. smaller values of ``score_col`` are treated as better.
    Every selected row must also provide the columns ``max_depth``,
    ``max_features`` and ``min_samples_leaf``; they are forwarded to ``model``
    as keyword arguments (``max_depth`` and ``min_samples_leaf`` cast to int).

    Parameters
    ----------
    filename : str
        Path of the CSV file, passed straight to ``pandas.read_csv``.
    model : callable
        Estimator class (or factory) accepting ``max_depth``, ``max_features``
        and ``min_samples_leaf`` keyword arguments.
    score_col : str
        Column used both for ranking the rows and as the returned score.
    k : int
        Number of top-ranked rows to turn into estimators.

    Returns
    -------
    list of (score, estimator) tuples
        A real list (not a one-shot iterator), so callers can iterate it
        several times and concatenate results with ``+``.

    Raises
    ------
    TypeError
        If ``model`` is None.
    """
    if model is None:
        raise TypeError("model must be an estimator class or factory, got None")

    df = pd.read_csv(filename)

    # Rank the frame that was just loaded; the previous code iterated a global
    # (`rf_best_df`) that only existed as leftover kernel state.
    best_rows = df.sort_values(by=score_col).head(k)

    results = []
    for _, row in best_rows.iterrows():
        # Read the score from the same column the rows were ranked by, rather
        # than from a hard-coded 'r2score' column.
        score, max_depth, max_features, min_samples_leaf = row[
            [score_col, 'max_depth', 'max_features', 'min_samples_leaf']]

        estimator = model(max_depth=int(max_depth),
                          max_features=max_features,
                          min_samples_leaf=int(min_samples_leaf))
        results.append((score, estimator))

    return results

In [156]:
# Top-5 hyperopt configurations per model family, as (score, estimator) pairs.
#
# list(...) guarantees a real list even if the helper hands back a one-shot
# iterator (e.g. a Python 3 `zip`): these collections are looped over in later
# cells and then concatenated with `+`, neither of which works on a consumed
# iterator.
#
# NOTE(review): the paths below are absolute and user-specific; consider a
# configurable data directory.
rf_models = list(obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-rf.csv",
                                                model=RandomForestRegressor,
                                                score_col='r2score', k=5))

et_models = list(obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-et.csv",
                                                model=ExtraTreesRegressor,
                                                score_col='r2score', k=5))

# The AdaBoost search results parameterise the decision trees that a later
# cell wraps in AdaBoostRegressor.
dt_models = list(obtain_models_by_hyperopt_file("/home/fausto/temp/hyperopt-ada.csv",
                                                model=DecisionTreeRegressor,
                                                score_col='r2score', k=5))

In [157]:
# Wrap each tuned decision tree in a 100-round AdaBoost ensemble, carrying the
# tree's hyperopt score along as the ensemble's score.
ada_models = [
    (score, AdaBoostRegressor(base_estimator=model_obj, n_estimators=100))
    for score, model_obj in dt_models
]

In [158]:
# Give every random-forest candidate 100 trees; the estimators are updated in
# place (the return value of set_params is not needed).
for _rf_score, rf_estimator in rf_models:
    rf_estimator.set_params(n_estimators=100)

In [159]:
# Give every extra-trees candidate 100 trees; the estimators are updated in
# place (the return value of set_params is not needed).
for _et_score, et_estimator in et_models:
    et_estimator.set_params(n_estimators=100)

In [160]:
# Pool the (score, estimator) pairs of all three model families into a single
# collection. `+` requires each operand to support concatenation (e.g. lists).
models = rf_models + et_models + ada_models

In [161]:
# Training set: every column except the target is a feature.
df = pd.read_csv('data/manipulated_train.csv')
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [162]:
# Fit every pooled estimator on the full training set; the score half of each
# pair is not used here.
for _model_score, estimator in models:
    estimator.fit(X, y)

In [182]:
# Unseen data: 'Id' is kept aside for the submission frame and removed from
# the feature matrix. Note that `df` is rebound here to the test set.
df = pd.read_csv('data/manipulated_test.csv')
X_unseen = df.drop(columns=['Id'])
submission_df = df[['Id']].copy()

In [175]:
# Accumulate a weighted sum of the predictions of all models. Each model is
# weighted by (1 - score), so a smaller score means a larger weight;
# total_score collects the weights for the normalisation done afterwards.
preds = None
total_score = 0

for score, estimator in models:
    weight = 1 - score
    weighted_prediction = weight * estimator.predict(X_unseen)

    # The first model initialises the accumulator; later ones are added to it.
    preds = weighted_prediction if preds is None else preds + weighted_prediction
    total_score += weight

In [184]:
# Normalise the weighted sum by the total weight to obtain the weighted mean
# prediction, and store it as the submission's target column.
ensemble_prediction = preds / total_score
submission_df['SalePrice'] = ensemble_prediction

In [186]:
# Write the submission without the index column.
submission_df.to_csv(path_or_buf='/tmp/submissions.csv', index=False)