In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math
import re

# Load the data

In [3]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [4]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [6]:
train_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
# Get the columnas that have nan values
columns_without_nans = []
columns_with_nans = []
for column in train_df.columns:
    num_nans = train_df[column].isnull().values.sum()
    if num_nans > 0:
        print("{} (type {}): {} nans".format(column, train_df[column].dtype, num_nans))
        columns_with_nans.append(column)
    else:
        columns_without_nans.append(column)

LotFrontage (type float64): 259 nans
Alley (type object): 1369 nans
MasVnrType (type object): 8 nans
MasVnrArea (type float64): 8 nans
BsmtQual (type object): 37 nans
BsmtCond (type object): 37 nans
BsmtExposure (type object): 38 nans
BsmtFinType1 (type object): 37 nans
BsmtFinType2 (type object): 38 nans
Electrical (type object): 1 nans
FireplaceQu (type object): 690 nans
GarageType (type object): 81 nans
GarageYrBlt (type float64): 81 nans
GarageFinish (type object): 81 nans
GarageQual (type object): 81 nans
GarageCond (type object): 81 nans
PoolQC (type object): 1453 nans
Fence (type object): 1179 nans
MiscFeature (type object): 1406 nans


In [8]:
categorical_columns = []
for column in columns_without_nans:
    column_type = train_df[column].dtype
    if column_type == "object":
        categorical_columns.append(column)

In [9]:
print("Categorical columns: {}".format(categorical_columns))

Categorical columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


In [26]:
# Fill the categorical columns, creating dummy (1/0) columns
expanded_train_df = pd.get_dummies(train_df, dummy_na=True)
expanded_test_df = pd.get_dummies(test_df, dummy_na=True)

In [27]:
# In case any column of test is not present in train, set it to zero
all_columns = set(expanded_train_df.columns).union(set(expanded_test_df.columns)) - set(["SalePrice"])
for column in all_columns:
    if column not in expanded_train_df.columns:
        expanded_train_df[column] = 0
    if column not in expanded_test_df.columns:
        expanded_test_df[column] = 0

In [37]:
# For each NAN fill it to the mean value of that column (TODO: check median value)
final_train_df = expanded_train_df.fillna(expanded_train_df.median())
final_test_df = expanded_test_df.fillna(expanded_test_df.median())

In [38]:
# Prepare input data to regressors
y = final_train_df["SalePrice"].values

X = final_train_df.drop("SalePrice", axis=1).values

test_x = final_test_df.values

In [39]:
print y.shape
print X.shape
print test_x.shape

(1460,)
(1460, 332)
(1459, 332)


In [40]:
# Usefull runner
class RegressorRunner(object):
    
    def __init__(self, pipeline, parameters, cv=5, debug=False):
        self.pipeline = pipeline
        self.parameters = parameters
        self.grid_search = GridSearchCV(self.pipeline, self.parameters, cv=cv)
        self.debug = debug
        
    def fit(self, X, y):
        self.grid_search.fit(X, y)

    @property
    def best_params(self):
        return self.grid_search.best_params_
        
    def get_scores(self, X, y, num_folds=5):
        scores = cross_val_score(self.grid_search.best_estimator_, X, y, cv=num_folds)
        return scores
    
    @property
    def feature_importances(self):
        classifier_step_index = 0
        for step_name, step_process in self.grid_search.best_estimator_.steps:
            if step_name == "regressor":
                break
            classifier_step_index += 1
        feature_importances = self.grid_search.best_estimator_.steps[classifier_step_index][1].feature_importances_
        return sorted(zip(feature_importances, selected_features), reverse=True)
    
    def predict(self, X_test):
        prediction = self.grid_search.predict(X_test)
        return prediction

    def apply_predicition_to_df(self, X_test, test_df, output_filename):
        prediction = self.predict(X_test)
        # Add the prediction to the test dataset
        estimated_test_df = test_df.assign(SalePrice=list(prediction))
        # Save 
        estimated_test_df.to_csv(output_filename, columns=["Id", "SalePrice"], index=False)

In [41]:
# Linear regressor
pipeline = Pipeline([
    ("regressor", LinearRegression())
])

parameters = { 
    'regressor__fit_intercept': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found: ")
print (runner.best_params)

scores = runner.get_scores(X, y)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_ln.csv")

Best parameters found: 
{'regressor__fit_intercept': False, 'regressor__n_jobs': -1}
Expected performance: 79.85% (+/-7.18).


In [42]:
# Polynomial regressor
for degree in [2, 3, 4, 5]:
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    runner = RegressorRunner(pipeline=pipeline, parameters={})

    runner.fit(X, y)

    print ("Best parameters found for polynomial regression {}: ".format(degree))
    print (runner.best_params)

    scores = runner.get_scores(X, y)
    print ("Expected performance (degree {}): {:.2f}% (+/-{:.2f}).".format(degree, np.mean(scores)*100., np.std(scores)*100.))

    runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_poly_{}.csv".format(degree))


Best parameters found for polynomial regression 2: 
{}
Expected performance (degree 2): 79.51% (+/-8.06).
Best parameters found for polynomial regression 3: 
{}
Expected performance (degree 3): 79.51% (+/-8.06).
Best parameters found for polynomial regression 4: 
{}
Expected performance (degree 4): 79.51% (+/-8.06).
Best parameters found for polynomial regression 5: 
{}
Expected performance (degree 5): 79.51% (+/-8.06).


In [None]:
# Random forest regressor
pipeline = Pipeline([
    ("regressor", RandomForestRegressor())
])

parameters = {
    'regressor__n_estimators': [10, 20, 30, 40, 100], 
    'regressor__criterion': ["mse", "mae"],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__bootstrap': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found: ")
print (runner.best_params)

feature_importances = runner.feature_importances
print("Selected features by importance: {}".format(feature_importances))

scores = runner.get_scores(X, y)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

classifier.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_rf.csv")