In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math
import re

# Load the data

In [3]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [4]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [6]:
train_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Fill nan values

In [7]:
# Get the columnas that have nan values
columns_without_nans = []
columns_with_nans = []
for column in train_df.columns:
    num_nans = train_df[column].isnull().values.sum()
    if num_nans > 0:
        print("{} (type {}): {} nans".format(column, train_df[column].dtype, num_nans))
        columns_with_nans.append(column)
    else:
        columns_without_nans.append(column)

LotFrontage (type float64): 259 nans
Alley (type object): 1369 nans
MasVnrType (type object): 8 nans
MasVnrArea (type float64): 8 nans
BsmtQual (type object): 37 nans
BsmtCond (type object): 37 nans
BsmtExposure (type object): 38 nans
BsmtFinType1 (type object): 37 nans
BsmtFinType2 (type object): 38 nans
Electrical (type object): 1 nans
FireplaceQu (type object): 690 nans
GarageType (type object): 81 nans
GarageYrBlt (type float64): 81 nans
GarageFinish (type object): 81 nans
GarageQual (type object): 81 nans
GarageCond (type object): 81 nans
PoolQC (type object): 1453 nans
Fence (type object): 1179 nans
MiscFeature (type object): 1406 nans


In [8]:
categorical_columns = []
for column in columns_without_nans:
    column_type = train_df[column].dtype
    if column_type == "object":
        categorical_columns.append(column)

In [9]:
print("Categorical columns: {}".format(categorical_columns))

Categorical columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


In [10]:
# Fill the categorical columns, creating dummy (1/0) columns
expanded_train_df = pd.get_dummies(train_df, dummy_na=True)
expanded_test_df = pd.get_dummies(test_df, dummy_na=True)

In [11]:
# In case any column of test is not present in train, set it to zero
all_columns = set(expanded_train_df.columns).union(set(expanded_test_df.columns)) - set(["SalePrice"])
for column in all_columns:
    if column not in expanded_train_df.columns:
        expanded_train_df[column] = 0
    if column not in expanded_test_df.columns:
        expanded_test_df[column] = 0

In [12]:
# For each NAN fill it to the median value of that column
ready_train_df = expanded_train_df.fillna(expanded_train_df.median())
ready_test_df = expanded_test_df.fillna(expanded_test_df.median())

# New attributes

In [13]:
# Mean sq. feet per room: Bedroom/
# Bedrooms per rooms
# Total sq. feet
# Built area in sq. feet: LotArea - 1stFlrSF - MasVnrArea
# Total home area: 1stFlrSF + 2stFlSF + TotalBsmtSF

# Correlations

In [14]:
correlation_matrix = ready_train_df.corr()
correlation_values = correlation_matrix["SalePrice"].sort_values(ascending=False)
print(correlation_values)

SalePrice                1.000000
OverallQual              0.790982
GrLivArea                0.708624
GarageCars               0.640409
GarageArea               0.623431
TotalBsmtSF              0.613581
1stFlrSF                 0.605852
FullBath                 0.560664
BsmtQual_Ex              0.553105
TotRmsAbvGrd             0.533723
YearBuilt                0.522897
YearRemodAdd             0.507101
KitchenQual_Ex           0.504094
Foundation_PConc         0.497734
MasVnrArea               0.472614
Fireplaces               0.466929
GarageYrBlt              0.466754
ExterQual_Gd             0.452466
ExterQual_Ex             0.451164
BsmtFinType1_GLQ         0.434597
HeatingQC_Ex             0.434543
GarageFinish_Fin         0.419678
Neighborhood_NridgHt     0.402149
BsmtFinSF1               0.386420
SaleType_New             0.357509
SaleCondition_Partial    0.352060
FireplaceQu_Gd           0.339329
GarageType_Attchd        0.335961
LotFrontage              0.334771
MasVnrType_Sto

In [15]:
feature_correlation_pairs = []
for feature, value in correlation_values.items():
    feature_correlation_pairs.append((feature, abs(value)))
    
sorted_feature_correlation_pairs = sorted(feature_correlation_pairs, key=lambda pair: pair[1], reverse=True)    

most_correlated_features = [
    feature_correlation_pair[0]
    for feature_correlation_pair in sorted_feature_correlation_pairs 
]
for sorted_feature_correlation_pair in sorted_feature_correlation_pairs:
    print(sorted_feature_correlation_pair)

('SalePrice', 1.0)
('OverallQual', 0.7909816005838047)
('GrLivArea', 0.7086244776126511)
('GarageCars', 0.640409197258349)
('GarageArea', 0.6234314389183598)
('TotalBsmtSF', 0.6135805515591944)
('1stFlrSF', 0.6058521846919166)
('ExterQual_TA', 0.589043523409763)
('FullBath', 0.5606637627484452)
('BsmtQual_Ex', 0.5531048470089422)
('TotRmsAbvGrd', 0.5337231555820238)
('YearBuilt', 0.5228973328794967)
('KitchenQual_TA', 0.5192978536548846)
('YearRemodAdd', 0.5071009671113867)
('KitchenQual_Ex', 0.5040936759052956)
('Foundation_PConc', 0.4977337525869438)
('MasVnrArea', 0.47261449900457725)
('FireplaceQu_nan', 0.4719080685164922)
('Fireplaces', 0.4669288367515242)
('GarageYrBlt', 0.4667536523633402)
('ExterQual_Gd', 0.45246612784479223)
('BsmtQual_TA', 0.45239353235010327)
('ExterQual_Ex', 0.45116433022275354)
('BsmtFinType1_GLQ', 0.4345973468827756)
('HeatingQC_Ex', 0.434543238532467)
('GarageFinish_Fin', 0.41967796781801714)
('GarageFinish_Unf', 0.41060831129167175)
('Neighborhood_Nridg

In [16]:
number_of_best_features_to_keep = 20 

print("Droping {} columns".format(len(most_correlated_features[number_of_best_features_to_keep:])))

final_train_df = ready_train_df.drop(most_correlated_features[number_of_best_features_to_keep:], axis=1)

final_test_df = ready_test_df.drop(most_correlated_features[number_of_best_features_to_keep:], axis=1)

print("{} selected columns: {}".format(len(final_train_df.columns), final_train_df.columns))

Droping 313 columns
20 selected columns: Index([u'OverallQual', u'YearBuilt', u'YearRemodAdd', u'MasVnrArea',
       u'TotalBsmtSF', u'1stFlrSF', u'GrLivArea', u'FullBath', u'TotRmsAbvGrd',
       u'Fireplaces', u'GarageYrBlt', u'GarageCars', u'GarageArea',
       u'SalePrice', u'ExterQual_TA', u'Foundation_PConc', u'BsmtQual_Ex',
       u'KitchenQual_Ex', u'KitchenQual_TA', u'FireplaceQu_nan'],
      dtype='object')


In [17]:
# Prepare input data to regressors
y = final_train_df["SalePrice"].values

X = final_train_df.drop("SalePrice", axis=1).values

test_x = final_test_df.values

In [18]:
print y.shape
print X.shape
print test_x.shape

(1460,)
(1460, 19)
(1459, 19)


In [43]:
# Usefull runner
class RegressorRunner(object):
    
    def __init__(self, pipeline, parameters, cv=5, debug=False):
        self.pipeline = pipeline
        self.parameters = parameters
        self.grid_search = GridSearchCV(self.pipeline, self.parameters, cv=cv)
        self.debug = debug
        self.prediction = None
        self.X_train = None
        self.y_train = None
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.grid_search.fit(X, y)

    @property
    def best_params(self):
        return self.grid_search.best_params_
    
    @property
    def best_estimator(self):
        return self.grid_search.best_estimator_
    
    def get_scores(self, X, y, num_folds=5):
        scores = cross_val_score(self.grid_search.best_estimator_, X, y, cv=num_folds)
        return scores
    
    @property
    def feature_importances(self):
        classifier_step_index = 0
        for step_name, step_process in self.grid_search.best_estimator_.steps:
            if step_name == "regressor":
                break
            classifier_step_index += 1
        feature_importances = self.grid_search.best_estimator_.steps[classifier_step_index][1].feature_importances_
        return sorted(zip(feature_importances, selected_features), reverse=True)
    
    def predict(self, X_test):
        if self.prediction is None:
            self.prediction = self.grid_search.predict(X_test)
        return self.prediction
    
    def mean_squared_error(self, X, y):
        y_predicted = self.grid_search.predict(X)
        return mean_squared_error(y, y_predicted)

    def apply_predicition_to_df(self, X_test, test_df, output_filename, knn_estimator=None):
        if self.prediction is None:
            self.predict(X_test)
        # Add the prediction to the test dataset
        estimated_test_df = test_df.assign(SalePrice=list(self.prediction))
        
        if knn_estimator is None:
            # IMPORTANT PATCH: NO SALE PRICE MUST BE NEGATIVE
            # In case there is any negative SalePrice, set it to 0
            estimated_test_df["SalePrice"] = estimated_test_df["SalePrice"].map(
                lambda sale_price: np.nan if sale_price < 0 else sale_price
            )

            estimated_test_df["SalePrice"].fillna(estimated_test_df["SalePrice"].median(), inplace=True)
        
        else:
            
            for i, row in estimated_test_df.iterrows():
                if row["SalePrice"] <= 0:
                    positive_sale_price = knn_estimator.predict(row.drop("SalePrice", axis=1))
                    estimated_test_df.set_value(i, 'SalePrice', positive_sale_price)
        
        # Save 
        estimated_test_df.to_csv(output_filename, columns=["Id", "SalePrice"], index=False)

In [44]:
# KNN regressor
pipeline = Pipeline([
    ("regressor", KNeighborsRegressor())
])

parameters = { 
    'regressor__n_neighbors': [3, 5, 7, 10],
    'regressor__weights': ["uniform", "distance"],
    'regressor__algorithm': ["auto", "ball_tree", "kd_tree", "brute"],
    'regressor__n_jobs': [-1]
}

knn_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

knn_runner.fit(X, y)

print ("Best parameters found: ")
print (knn_runner.best_params)

scores = knn_runner.get_scores(X, y)
print("Mean of CV scores data {}".format(np.mean(scores)))

rmse = knn_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(rmse))

knn_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_knn.csv")

knn_estimator = knn_runner.best_estimator

Best parameters found: 
{'regressor__algorithm': 'brute', 'regressor__n_jobs': -1, 'regressor__weights': 'distance', 'regressor__n_neighbors': 10}
Mean of CV scores data 0.734059956268
RMSE of training data 387984.765068


In [45]:
# Linear regressor
pipeline = Pipeline([
    ("regressor", LinearRegression())
])

parameters = { 
    'regressor__fit_intercept': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found: ")
print (runner.best_params)

scores = runner.get_scores(X, y)
print("Mean of CV scores data {}".format(np.mean(scores)))

rmse = runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, knn_estimator=knn_estimator, output_filename="results/test_estimated_with_ln.csv")

Best parameters found: 
{'regressor__fit_intercept': True, 'regressor__n_jobs': -1}
Mean of CV scores data 0.79502802337
RMSE of training data 1183235001.39


In [46]:
# Polynomial regressor
for degree in [2, 3, 4, 5]:
    print("Polynomial regression {}: ".format(degree))
    pipeline = make_pipeline(PolynomialFeatures(degree), Ridge())
    runner = RegressorRunner(pipeline=pipeline, parameters={})

    runner.fit(X, y)

    print ("- Best parameters found for polynomial regression {}: {}".format(degree, runner.best_params))

    scores = runner.get_scores(X, y)
    print("- Mean of CV scores data {}".format(np.mean(scores)))

    rmse = runner.mean_squared_error(X, y)
    print("- RMSE of training data {}".format(rmse))
    
    runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_poly_{}.csv".format(degree))

    print("")

Polynomial regression 2: 


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.31903442362e-17 / 1.11022302463e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.57049480325e-17 / 1.11022302463e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.43694621684e-17 / 1.11022302463e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.25943878424e-17 / 1.11022302463e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 3.21961902121e-17 / 1.11022302463e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.01438492708e-17 / 1.11022302463e-16


- Best parameters found for polynomial regression 2: {}
- Mean of CV scores data 0.721813646695
- RMSE of training data 607881360.19

Polynomial regression 3: 




- Best parameters found for polynomial regression 3: {}
- Mean of CV scores data -27.7750339835
- RMSE of training data 4389234834.98


ValueError: No axis named 1 for object type <class 'pandas.core.series.Series'>

In [49]:
# Random forest regressor
pipeline = Pipeline([
    ("regressor", RandomForestRegressor())
])

best_parameters = {
    'regressor__n_estimators': [200], 
    'regressor__criterion': ["mse"],
    'regressor__max_features': ['sqrt'],
    'regressor__min_samples_split': [2],
    'regressor__min_samples_leaf': [1],
    'regressor__bootstrap': [False],
    'regressor__n_jobs': [-1]
}

parameters = {
    'regressor__n_estimators': [10, 20, 30, 40, 100], 
    'regressor__criterion': ["mse", "mae"],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__bootstrap': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=best_parameters)

runner.fit(X, y)

print ("Best parameters found: ")
print (runner.best_params)

print("- Mean of CV scores data {}".format(np.mean(scores)))

rmse = runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, knn_estimator=knn_estimator, output_filename="results/test_estimated_with_rf.csv")

Best parameters found: 
{'regressor__min_samples_split': 2, 'regressor__bootstrap': False, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__n_estimators': 200, 'regressor__criterion': 'mse', 'regressor__n_jobs': -1}
- Mean of CV scores data -27.7750339835
- RMSE of training data 388029.259932
