In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math
import re

# Load the data

In [None]:
# Read the training and test sets from CSV files under the relative data/ directory.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Summary statistics of the training frame.
train_df.describe()

# Fill NaN values

In [None]:
# Split the training columns into those that contain NaN values and those that do not
nan_counts = train_df.isnull().sum()
columns_with_nans = []
columns_without_nans = []
for column, num_nans in nan_counts.items():
    if num_nans <= 0:
        columns_without_nans.append(column)
        continue
    # Report every incomplete column together with its dtype
    print("{} (type {}): {} nans".format(column, train_df[column].dtype, num_nans))
    columns_with_nans.append(column)

In [None]:
# Complete (NaN-free) columns whose dtype is "object", i.e. the text-valued ones
categorical_columns = [
    column
    for column in columns_without_nans
    if train_df[column].dtype == "object"
]

In [None]:
# Show which complete columns were detected as categorical.
print("Categorical columns: {}".format(categorical_columns))

In [None]:
def encode_quality_columns(df):
    """Ordinal-encode the quality/condition grade columns of ``df`` in place.

    Grades are scored from "Ex" (5) down to "Po" (1), with "NA" scored 0.
    The replacement itself is delegated to ``_encode_columns``.

    :param df: frame holding the ten grade columns listed below.
    :return: None; ``df`` is modified.
    """
    grade_scores = dict(zip(["Ex", "Gd", "TA", "Fa", "Po", "NA"], [5, 4, 3, 2, 1, 0]))
    graded_columns = [
        "ExterQual",
        "ExterCond",
        "BsmtQual",
        "BsmtCond",
        "HeatingQC",
        "KitchenQual",
        "FireplaceQu",
        "GarageQual",
        "GarageCond",
        "PoolQC",
    ]
    _encode_columns(grade_scores, graded_columns, df)

    
def encode_basement_rating_columns(df):
    """Ordinal-encode the two basement finish-type columns of ``df`` in place.

    Ratings are scored from "GLQ" (6) down to "Unf" (1), with "NA" scored 0.
    The replacement itself is delegated to ``_encode_columns``.

    :param df: frame holding "BsmtFinType1" and "BsmtFinType2".
    :return: None; ``df`` is modified.
    """
    rating_scores = dict(
        zip(["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"], [6, 5, 4, 3, 2, 1, 0])
    )
    rated_columns = ["BsmtFinType1", "BsmtFinType2"]
    _encode_columns(rating_scores, rated_columns, df)


def encode_garage_finish_column(df):
    """Ordinal-encode the "GarageFinish" column of ``df`` in place.

    "Fin" scores 3, "RFn" 2, "Unf" 1 and "NA" 0. The replacement itself is
    delegated to ``_encode_columns``.

    :param df: frame holding a "GarageFinish" column.
    :return: None; ``df`` is modified.
    """
    finish_scores = dict(zip(["Fin", "RFn", "Unf", "NA"], [3, 2, 1, 0]))
    _encode_columns(finish_scores, ["GarageFinish"], df)

    
def encode_utilities(df):
    """Ordinal-encode the "Utilities" column of ``df`` in place.

    "AllPub" scores 3, "NoSewr" 2, "NoSeWa" 1 and "ELO" 0. The replacement
    itself is delegated to ``_encode_columns``.

    NOTE(review): ``encode_columns`` in this notebook does not call this
    function, so "Utilities" is currently left for one-hot encoding.

    :param df: frame holding a "Utilities" column.
    :return: None; ``df`` is modified.
    """
    utility_scores = dict(zip(["AllPub", "NoSewr", "NoSeWa", "ELO"], [3, 2, 1, 0]))
    _encode_columns(utility_scores, ["Utilities"], df)


def _encode_columns(label_encoding_correspondence, columns, df):
    for column in columns:
        df[column] = df[column].map(lambda cell: label_encoding_correspondence.get(cell, 0))
        df[column] = df[column].astype(int)


def encode_columns(df):
    """Apply the quality, basement-rating and garage-finish encoders to ``df`` in place.

    :param df: frame holding the columns those three encoders expect.
    :return: None; ``df`` is modified.
    """
    encoders = (
        encode_quality_columns,
        encode_basement_rating_columns,
        encode_garage_finish_column,
    )
    for encoder in encoders:
        encoder(df)


# Encode the ordinal columns of both frames; encode_columns modifies its argument in place.
encode_columns(train_df)
encode_columns(test_df)

In [None]:
# Recompute which columns still contain NaN values now that the ordinal columns are encoded
columns_without_nans = []
columns_with_nans = []
for column in train_df.columns:
    num_nans = train_df[column].isnull().values.sum()
    has_nans = num_nans > 0
    if has_nans:
        print("{} (type {}): {} nans".format(column, train_df[column].dtype, num_nans))
    target_list = columns_with_nans if has_nans else columns_without_nans
    target_list.append(column)

In [None]:
# Column dtypes and non-null counts after the ordinal encoding.
train_df.info()

In [None]:
# One-hot encode the remaining categorical columns as dummy (1/0) columns.
# dummy_na=True also adds an indicator column for missing values.
# Train and test are expanded independently, so their column sets may differ.
expanded_train_df = pd.get_dummies(train_df, dummy_na=True)
expanded_test_df = pd.get_dummies(test_df, dummy_na=True)

In [None]:
# Align the two expanded frames: a column present in only one of them is added
# to the other one filled with zeros ("SalePrice" is never added to test).
all_columns = set(expanded_train_df.columns).union(set(expanded_test_df.columns)) - set(["SalePrice"])
# Iterate in sorted order: the iteration order of a set of strings is not stable
# between kernels, which made the position of the added columns non-reproducible.
for column in sorted(all_columns):
    if column not in expanded_train_df.columns:
        expanded_train_df[column] = 0
    if column not in expanded_test_df.columns:
        expanded_test_df[column] = 0

# Give test exactly the column order of train (minus the target). The feature
# matrices are later extracted with .values, i.e. positionally, so a different
# column order would silently feed the wrong feature to the regressors.
expanded_test_df = expanded_test_df[
    [column for column in expanded_train_df.columns if column != "SalePrice"]
]

In [None]:
# Replace every remaining NaN by the median of its column.
# Each frame is filled with its own medians (test uses test medians, not train ones).
train_column_medians = expanded_train_df.median()
test_column_medians = expanded_test_df.median()
ready_train_df = expanded_train_df.fillna(train_column_medians)
ready_test_df = expanded_test_df.fillna(test_column_medians)

# New attributes

In [None]:
def add_new_features(df):
    """Add two derived area columns to ``df`` in place.

    * "BuiltAreaSF": LotArea - 1stFlrSF
    * "TotalHomeAreaSF": 1stFlrSF + 2ndFlrSF + TotalBsmtSF

    :param df: frame holding "LotArea", "1stFlrSF", "2ndFlrSF" and "TotalBsmtSF".
    :return: None; ``df`` is modified.
    """
    lot_area, first_floor = df["LotArea"], df["1stFlrSF"]
    df["BuiltAreaSF"] = lot_area - first_floor

    second_floor, basement = df["2ndFlrSF"], df["TotalBsmtSF"]
    df["TotalHomeAreaSF"] = first_floor + second_floor + basement

# Add the derived area columns to both prepared frames (modified in place).
add_new_features(ready_train_df)
add_new_features(ready_test_df)

# Full transformation process

In [None]:
def transform_input_data(df):
    """Run the whole preparation chain on ``df`` and return the prepared frame.

    Steps: ordinal encoding (this modifies ``df`` itself), one-hot encoding
    with NaN indicator columns, median imputation, derived area features.

    :param df: raw frame with the columns the encoders and feature builders expect.
    :return: a new, fully numeric frame.
    """
    encode_columns(df)
    prepared = pd.get_dummies(df, dummy_na=True)
    column_medians = prepared.median()
    prepared.fillna(column_medians, inplace=True)
    add_new_features(prepared)
    return prepared

# Correlations

In [None]:
# Pairwise correlations between all prepared training columns.
correlation_matrix = ready_train_df.corr()
# Correlation of every column with the target, highest (most positive) first.
correlation_values = correlation_matrix["SalePrice"].sort_values(ascending=False)
print(correlation_values)

In [None]:
# (feature, |correlation with SalePrice|) pairs, in the order of correlation_values
feature_correlation_pairs = [
    (feature, abs(value))
    for feature, value in correlation_values.items()
]

# Rank by absolute correlation, strongest first. A constant column has a NaN
# correlation, and a plain sorted() on keys containing NaN yields an unreliable
# order; sort_values places NaN explicitly at the end instead, so those features
# are the first candidates to be dropped. mergesort is stable, so ties keep the
# order of correlation_values.
ranked_correlations = correlation_values.abs().sort_values(
    ascending=False, kind="mergesort", na_position="last"
)
sorted_feature_correlation_pairs = list(zip(ranked_correlations.index, ranked_correlations.values))

# Feature names only, strongest first ("SalePrice" itself comes first)
most_correlated_features = list(ranked_correlations.index)

for sorted_feature_correlation_pair in sorted_feature_correlation_pairs:
    print(sorted_feature_correlation_pair)

In [None]:
# How many entries at the head of the correlation ranking are kept by drop_worst_features.
# NOTE(review): the ranking is built from correlations with "SalePrice", which
# presumably ranks first itself, leaving 24 predictors - confirm.
number_of_best_features_to_keep = 25 

def drop_worst_features(df, most_correlated_features, number_to_keep=None):
    """Return a copy of ``df`` without the weakest-ranked features.

    :param df: frame to prune; it is not modified.
    :param most_correlated_features: feature names ordered from most to least
        correlated with the target.
    :param number_to_keep: how many names at the head of the ranking are kept.
        Defaults to the notebook-level ``number_of_best_features_to_keep``.
    :return: new frame without the columns ranked after ``number_to_keep``.
    :raises KeyError: if a column to drop is not present in ``df``.
    """
    if number_to_keep is None:
        # Backward-compatible default: the notebook-level setting
        number_to_keep = number_of_best_features_to_keep
    columns_to_drop = most_correlated_features[number_to_keep:]
    print("Droping {} columns".format(len(columns_to_drop)))
    return df.drop(columns_to_drop, axis=1)

# Keep only the best-ranked columns in both frames; the same ranking is used for train and test.
final_train_df = drop_worst_features(ready_train_df, most_correlated_features)
final_test_df = drop_worst_features(ready_test_df, most_correlated_features)

print("{} selected columns: {}".format(len(final_train_df.columns), final_train_df.columns))

In [None]:
# Prepare input data to regressors
y = final_train_df["SalePrice"].values

# Feature names in training order; the target is not a feature
feature_columns = final_train_df.columns.drop("SalePrice")

X = final_train_df[feature_columns].values

# Select the test columns by name, in training order. The regressors only see
# positional arrays, so this guarantees that column i means the same feature in
# X and test_x (and fails loudly with a KeyError if a feature is missing).
test_x = final_test_df[feature_columns].values

In [None]:
# Sanity-check the shapes of the target vector and of both feature matrices.
# print() calls replace the Python 2 print statements, which are a SyntaxError on Python 3.
print(y.shape)
print(X.shape)
print(test_x.shape)

In [None]:
# Useful runner
class RegressorRunner(object):
    """Tune a scikit-learn pipeline with ``GridSearchCV`` and export its predictions.

    :param pipeline: estimator/pipeline handed to ``GridSearchCV``.
    :param parameters: parameter grid handed to ``GridSearchCV``.
    :param cv: cross-validation setting forwarded to ``GridSearchCV``.
    :param debug: when True, ``apply_predicition_to_df`` prints how many
        predicted prices were not positive.
    :param feature_names: optional labels, one per column of the training
        matrix, used by ``feature_importances``. When None, column positions
        are used as labels.
    """

    def __init__(self, pipeline, parameters, cv=5, debug=True, feature_names=None):
        self.pipeline = pipeline
        self.parameters = parameters
        self.grid_search = GridSearchCV(self.pipeline, self.parameters, cv=cv)
        self.debug = debug
        self.feature_names = feature_names
        self.prediction = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Run the grid search on ``X``/``y`` and remember the training data."""
        self.X_train = X
        self.y_train = y
        # Predictions of a previous fit no longer describe this model
        self.prediction = None
        self.grid_search.fit(X, y)

    @property
    def best_params(self):
        """Parameter combination selected by the grid search."""
        return self.grid_search.best_params_

    @property
    def best_estimator(self):
        """Estimator refitted by the grid search with the best parameters."""
        return self.grid_search.best_estimator_

    def get_scores(self, X, y, num_folds=5):
        """Cross-validate the best estimator on ``X``/``y`` and return the fold scores."""
        scores = cross_val_score(self.grid_search.best_estimator_, X, y, cv=num_folds)
        return scores

    @property
    def feature_importances(self):
        """(importance, feature label) pairs of the fitted regressor, largest first.

        The regressor is the pipeline step named "regressor" or, when no step
        has that name (e.g. pipelines built with ``make_pipeline``), the last
        step. Raises AttributeError if that estimator does not expose
        ``feature_importances_``.
        """
        steps = self.grid_search.best_estimator_.steps
        regressor = dict(steps).get("regressor", steps[-1][1])
        importances = regressor.feature_importances_
        labels = getattr(self, "feature_names", None)
        if labels is None:
            # No names supplied: fall back to column positions
            labels = range(len(importances))
        return sorted(zip(importances, labels), reverse=True)

    def predict(self, X_test):
        """Predict ``X_test`` with the tuned model and store the result.

        The prediction is always recomputed, so calling this method with a
        different matrix never returns the values of an earlier call.
        """
        self.prediction = self.grid_search.predict(X_test)
        return self.prediction

    def mean_squared_error(self, X, y):
        """Mean squared error (not its square root) of the tuned model on ``X``/``y``."""
        y_predicted = self.grid_search.predict(X)
        return mean_squared_error(y, y_predicted)

    def apply_predicition_to_df(self, X_test, test_df, output_filename, estimator_for_negatives=None):
        """Predict ``X_test`` and write an "Id"/"SalePrice" CSV file.

        Predicted prices that are zero or negative are invalid and replaced:

        * without ``estimator_for_negatives``: by the median of the valid
          predicted prices;
        * with ``estimator_for_negatives``: by that estimator's prediction for
          the same rows of ``X_test`` (it must therefore have been fitted on
          the same feature columns).

        :param X_test: feature matrix, one row per row of ``test_df``.
        :param test_df: frame holding at least an "Id" column; not modified.
        :param output_filename: path of the CSV file to write.
        :param estimator_for_negatives: optional fitted estimator with a
            ``predict`` method, used for the invalid rows.
        :return: None.
        """
        predicted_prices = np.asarray(self.predict(X_test), dtype=float).ravel()
        # Add the prediction to a copy of the test dataset
        estimated_test_df = test_df.assign(SalePrice=predicted_prices)

        # The same "<= 0" rule is used for the report and for the replacement
        invalid_rows = estimated_test_df["SalePrice"] <= 0

        if self.debug:
            print("{} houses have negative prices of {}".format(
                int(invalid_rows.sum()), estimated_test_df.shape[0]))

        if invalid_rows.any():
            if estimator_for_negatives is None:
                # IMPORTANT PATCH: NO SALE PRICE MUST BE NEGATIVE
                replacement = estimated_test_df.loc[~invalid_rows, "SalePrice"].median()
            else:
                fallback_input = np.asarray(X_test)[invalid_rows.values]
                replacement = np.asarray(
                    estimator_for_negatives.predict(fallback_input), dtype=float).ravel()
            # Assign through .loc so the frame itself is updated
            estimated_test_df.loc[invalid_rows, "SalePrice"] = replacement

        # Save
        estimated_test_df.to_csv(output_filename, columns=["Id", "SalePrice"], index=False)

In [None]:
# KNN regressor: grid-search a k-nearest-neighbours model and export its test predictions
pipeline = Pipeline([
    ("regressor", KNeighborsRegressor())
])

parameters = { 
    'regressor__n_neighbors': [3, 5, 7, 10],
    'regressor__weights': ["uniform", "distance"],
    'regressor__algorithm': ["auto", "ball_tree", "kd_tree", "brute"],
    'regressor__n_jobs': [-1]
}

knn_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

knn_runner.fit(X, y)

print ("Best parameters found for KNN regression: ")
print (knn_runner.best_params)

scores = knn_runner.get_scores(X, y)
print("Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(knn_runner.mean_squared_error(X, y))
print("RMSE of training data {}".format(rmse))

knn_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_knn.csv")

# Keep the tuned KNN model available for later cells
knn_estimator = knn_runner.best_estimator

In [None]:
# Linear regressor: grid-search an ordinary least squares model and export its test predictions
pipeline = Pipeline([
    ("regressor", LinearRegression())
])

parameters = { 
    'regressor__fit_intercept': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Linear regression: ")
print (runner.best_params)

scores = runner.get_scores(X, y)
print("Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(runner.mean_squared_error(X, y))
print("RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, estimator_for_negatives=None, output_filename="results/test_estimated_with_ln.csv")

In [None]:
# Polynomial regressor: ridge regression on polynomial feature expansions of increasing degree.
# One prediction file is written per degree.
for degree in [2, 3, 4, 5]:
    print("Polynomial regression {}: ".format(degree))
    pipeline = make_pipeline(PolynomialFeatures(degree), Ridge())
    # Empty grid: the runner is only used for its fit/score/export helpers
    runner = RegressorRunner(pipeline=pipeline, parameters={})

    runner.fit(X, y)

    print ("- Best parameters found for polynomial regression {}: {}".format(degree, runner.best_params))

    scores = runner.get_scores(X, y)
    print("- Mean of CV scores data {}".format(np.mean(scores)))

    # The runner returns the mean squared error; take its root so that the
    # printed value really is the RMSE announced by the label.
    rmse = np.sqrt(runner.mean_squared_error(X, y))
    print("- RMSE of training data {}".format(rmse))
    
    runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_poly_{}.csv".format(degree))

    print("")

In [None]:
# Decision tree regressor: grid-search a single regression tree and export its test predictions
pipeline = Pipeline([
    ("regressor", DecisionTreeRegressor())
])

# NOTE(review): the "mse"/"mae" criterion names, max_features='auto' and the
# presort option belong to older scikit-learn releases - verify them against
# the installed version.
parameters = { 
    'regressor__criterion': ["mse", "mae", "friedman_mse"],
    'regressor__random_state': [42],
    'regressor__max_depth': [100, 300, 500, 1000],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__presort': [True, False]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Decision Tree regression: ")
print (runner.best_params)

# Score this model; without this line the print below reported the scores
# left over from a previously executed cell.
scores = runner.get_scores(X, y)
print("- Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(runner.mean_squared_error(X, y))
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, estimator_for_negatives=None, output_filename="results/test_estimated_with_dt.csv")

In [None]:
# Gaussian Process regressor: fit a GP model and export its test predictions
pipeline = Pipeline([
    ("regressor", GaussianProcessRegressor())
])


# Only the random seed is varied in this grid
parameters = {
    'regressor__random_state': [1,2,3,4,5]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Gaussian Process regression: ")
print (runner.best_params)

# Score this model; without this line the print below reported the scores
# left over from a previously executed cell.
scores = runner.get_scores(X, y)
print("- Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(runner.mean_squared_error(X, y))
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, estimator_for_negatives=None, output_filename="results/test_estimated_with_gp.csv")

In [None]:
# Random forest regressor: fit a forest with a fixed parameter set and export its test predictions
pipeline = Pipeline([
    ("regressor", RandomForestRegressor())
])

# Single-combination grid handed to the runner below
# NOTE(review): the "mse"/"mae" criterion names and max_features='auto' belong
# to older scikit-learn releases - verify them against the installed version.
best_parameters = {
    'regressor__n_estimators': [100], 
    'regressor__criterion': ["mae"],
    'regressor__max_features': ['sqrt'],
    'regressor__min_samples_split': [3],
    'regressor__min_samples_leaf': [1],
    'regressor__bootstrap': [False],
    'regressor__n_jobs': [-1]
}

# Full search grid; it is NOT passed to the runner in this cell
parameters = {
    'regressor__n_estimators': [10, 20, 30, 40, 100], 
    'regressor__criterion': ["mse", "mae"],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__bootstrap': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=best_parameters)

runner.fit(X, y)

print ("Best parameters found for RF regression: ")
print (runner.best_params)

# Score this model; without this line the print below reported the scores
# left over from a previously executed cell.
scores = runner.get_scores(X, y)
print("- Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(runner.mean_squared_error(X, y))
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, estimator_for_negatives=None, output_filename="results/test_estimated_with_rf.csv")

In [None]:
# SVM regressor: grid-search a support vector regressor and export its test predictions
pipeline = Pipeline([
    ("regressor", SVR())
])

# "precomputed" is deliberately absent from the kernels: it expects a square
# (n_samples, n_samples) kernel matrix as input, so fitting it on the feature
# matrix X fails.
parameters = {
    'regressor__C': [1, 2], 
    'regressor__epsilon': [0.1, 0.05],
    'regressor__kernel': ['rbf', 'linear', 'poly', "sigmoid"]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for SVM regression: ")
print (runner.best_params)

# Score this model; without this line the print below reported the scores
# left over from a previously executed cell.
scores = runner.get_scores(X, y)
print("- Mean of CV scores data {}".format(np.mean(scores)))

# The runner returns the mean squared error; take its root so that the printed
# value really is the RMSE announced by the label.
rmse = np.sqrt(runner.mean_squared_error(X, y))
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, estimator_for_negatives=None, output_filename="results/test_estimated_with_SVM.csv")