In [142]:
import numpy as np
import pandas as pd
import seaborn

# import data processing modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer

# import model selection modules
from sklearn.model_selection import cross_val_score

# import data modeling modules
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [143]:
training_data = pd.read_csv('train.csv')

We'll begin a simple transformation pipeline: one-hot encode all categorical variables, impute median of missing continuous variables, and then standardize.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class RobustImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.hello = 'hello'
    def fit(self, X, y):
        return self
    def transform(self, X, y=None):
        

In [149]:
y = training_data['SalePrice']
X = training_data.drop(columns=['Id', 'SalePrice'], inplace=False)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('most_common', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder())
])

cat_features = list(X.columns[X.dtypes == 'object'])
num_features = list(X.columns[X.dtypes != 'object'])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
    ('missing-indicator', MissingIndicator(error_on_new=False), list(X.columns))
])

X_piped = full_pipeline.fit_transform(X)

In [150]:
def cv(model, data, labels, scoring='neg_mean_squared_error', cv=10):
    """ 
    Perform cross-validation on model and return average error-rate
    """
    scores = cross_val_score(model, data, labels, scoring=scoring, cv=cv)
    return np.mean(np.sqrt(-scores))

In [151]:
def best_model(models, data, labels):
    """
    Perform cross-validation on all models in [models] and 
    return model with best cv score
    """
    model_scores = []
    for model in models:
        model_scores.append((model, cv(model, data, labels)))
    return model_scores

In [152]:
def get_best_model(model_scores):
    best_to_worst = sorted(model_scores, key = lambda x: x[1])
    best = best_to_worst[0]
    return best

In [153]:
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), SVR()]

In [155]:
model_scores = best_model(models, X_piped, y)

In [156]:
best_model = get_best_model(model_scores)

In [157]:
test_data = pd.read_csv('test.csv')
test_ids = test_data['Id']
test_data.drop(columns=['Id'], inplace=True)
test_processed = full_pipeline.transform(test_data)
test_predictions = best_model.predict(test_processed)

ValueError: Found unknown categories ['missing'] in column 0 during transform

In [109]:
test_data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,52.619342,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,176.753926,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,0.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,0.0,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,1526.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [110]:
training_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0
