In [1]:
%matplotlib notebook
# We start off with the baseline import statements we need to do the basic data manipulation and visualization.
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
import calendar
from sklearn.ensemble import RandomForestClassifier

sns.set_style("whitegrid")

#We create and set aside a copy of the data for initial exploration
housing_train = pd.read_csv('../data/train.csv')
housing = housing_train.copy()

#MISSING DATA
total = housing.isnull().sum().sort_values(ascending=False)
percent = (housing.isnull().sum()/housing.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

#CORRELATION CHECK
corr_matrix = housing.corr()
top_corr = corr_matrix['SalePrice'].sort_values(ascending = False)

In [2]:
#DROPPING SOME COLUMNS
drop = ['PoolQC', 'PoolArea','MiscFeature', 'MiscVal', 'Alley', 'Fence', 'FireplaceQu', 'Fireplaces', 'LotFrontage']
drop2 = ['Id','GarageArea','1stFlrSF','GarageYrBlt']
housing.drop(columns = drop + drop2, inplace = True)
housing['Age'] = housing['YrSold'] - housing['YearBuilt']
housing['AgeRemodel'] = housing['YrSold'] - housing['YearRemodAdd']
housing = housing[housing.AgeRemodel >= 0]
housing.drop(columns = ['YearBuilt','YearRemodAdd'], inplace = True)

In [3]:
#FURTHER DATA CLEANING
housing_cat = housing.select_dtypes(exclude=[np.number])
housing_numeric = housing.select_dtypes(include=[np.number])

#Numeric
numeric_unbounded = ['LotArea', 'MasVnrArea','BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF',
                     'TotalBsmtSF','2ndFlrSF','LowQualFinSF','GrLivArea','WoodDeckSF',
                     'OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch', 'SalePrice',
                     'Age','AgeRemodel']

numeric_one_hot = ['MSSubClass','MoSold']

numeric_ordinal = [x for x in housing_numeric.columns 
                   if (x not in numeric_unbounded and x not in numeric_one_hot)]

housing_numeric_unbounded = housing_numeric[numeric_unbounded]
housing_numeric_one_hot = housing_numeric[numeric_one_hot]
housing_numeric_ordinal = housing_numeric[numeric_ordinal]

housing_numeric_one_hot['MSSubClass'] = housing_numeric_one_hot['MSSubClass'].astype('str')
housing_numeric_one_hot['MoSold'] = housing_numeric_one_hot['MoSold'].replace({i:calendar.month_name[i][:3] for i in range(1,13)})
housing_numeric_one_hot = pd.get_dummies(housing_numeric_one_hot)

#Categorical
cat_ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
               'BsmtFinType1', 'HeatingQC', 'KitchenQual','Functional','GarageFinish',
               'GarageQual', 'GarageCond']

housing_cat_ordinal = housing_cat[cat_ordinal]
housing_cat_ordinal.fillna('None', inplace = True)
housing_cat_one_hot = housing_cat.drop(columns = cat_ordinal)
housing_cat_one_hot = pd.get_dummies(housing_cat_one_hot)

def mapper(cat):
    if cat in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
               'HeatingQC', 'KitchenQual']:
        mapper = {'None':0, 'Po':1, 'Fa':2,'TA':3,'Gd':4,'Ex':5}
    elif cat == 'BsmtExposure':
            mapper = {'None':0,'No':1, 'Mn':2, 'Av':3,'Gd':4}
    elif cat == 'BsmtFinType1':
        mapper = {'None':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
    elif cat == 'Functional':
        mapper = {'Sal':0,'Sev':1,'Maj2':2,'Maj1':3,'Mod':4,'Min2':5, 'Min1':6,'Typ':7}
    else:
        mapper = {'None':0,'Unf':1,'RFn':2,'Fin':3}
        
    return mapper

for cat in cat_ordinal:
    housing_cat_ordinal[cat].replace(mapper(cat), inplace = True)

#Combining numeric and categorical
housing_ordinal = pd.concat([housing_numeric_ordinal,housing_cat_ordinal], axis = 'columns')
housing_one_hot = pd.concat([housing_numeric_one_hot, housing_cat_one_hot], axis = 'columns')
housing_clean = pd.concat([housing_one_hot, housing_ordinal, housing_numeric_unbounded], 
                          axis = 'columns')

#MORE CORRELATION
ordinal_prices = pd.concat([housing_ordinal, housing['SalePrice']], axis = 'columns')
ordinal_corr_matrix = ordinal_prices.corr()
top_corr_ordinal = ordinal_corr_matrix['SalePrice'].sort_values(ascending = False)

one_hot_prices = pd.concat([housing_one_hot, housing['SalePrice']], axis = 'columns')
one_hot_corr_matrix = one_hot_prices.corr()
top_corr_one_hot = one_hot_corr_matrix['SalePrice'].filter(like = 'Neighborhood').sort_values(ascending = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_numeric_one_hot['MSSubClass'] = housing_numeric_one_hot['MSSubClass'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_numeric_one_hot['MoSold'] = housing_numeric_one_hot['MoSold'].replace({i:calendar.month_name[i][:3] for i in range(1,13)})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_cat_ordinal.fillna('None', inpl

In [19]:
#REMOVAL OF THE REMAINING NaN
housing_clean.isnull().sum().sort_values(ascending=False)
df = housing_clean.copy()
problem_col = df.isin([np.nan, np.inf, -np.inf]).sum(axis=0)[df.isin([np.nan, np.inf, -np.inf]).sum(axis=0) != 0] 
index_to_drop = df[problem_col.index[0]][df[problem_col.index[0]].isin([np.nan, np.inf, -np.inf])].index
df.drop(index = index_to_drop, inplace = True)
# df.isnull().sum().sort_values(ascending=False)

In [20]:
#RANDOM FOREST FOR FEATURE IMPORTANCE
X_train = df.drop(columns = ['SalePrice'])
y_train = df['SalePrice']

forest = RandomForestClassifier(n_estimators=500, max_depth=4)

forest.fit(X_train, y_train)

forest.feature_importances_
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': forest.feature_importances_})

In [21]:
#We will look at feature importances and their correlation with the 'SalePrice'
score_df.sort_values('feature', inplace=True)
top_corr = df.corr()['SalePrice'].abs().drop(index = ['SalePrice']) #I suppose we want to look at the absolute value
                                                                    #of the correlation. Is that right?
top_corr.sort_index(inplace=True) 
#now rows of score_df and top_corr match and we can add the values of correlation
score_df['correlation'] = top_corr.values

In [73]:
score_df.sort_values(ascending=False, by = ['importance_score'], inplace = True)
score_df['importance_score_rank'] = [k for k in range(1,1+len(score_df.index))]
score_df.sort_values(ascending=False, by = ['correlation'], inplace = True)
score_df['correlation_rank'] = [k for k in range(1,1+len(score_df.index))]
score_df['overall_rank'] = (score_df['importance_score_rank'] + score_df['correlation_rank'])/2 
# score_df.sort_values(ascending=True, by = ['overall_rank'], inplace = True)
score_df.sort_values(ascending=True, by = ['correlation_rank'], inplace = True)
score_df.reset_index(drop = True, inplace = True)
score_df.head(20)

Unnamed: 0,feature,importance_score,correlation,importance_score_rank,correlation_rank,overall_rank
0,OverallQual,0.02684,0.792084,4,1,2.5
1,GrLivArea,0.032454,0.718891,1,2,1.5
2,ExterQual,0.012564,0.68292,20,3,11.5
3,KitchenQual,0.013004,0.659501,19,4,11.5
4,GarageCars,0.016247,0.640228,13,5,9.0
5,TotalBsmtSF,0.024386,0.61763,6,6,6.0
6,BsmtQual,0.013732,0.584715,17,7,12.0
7,FullBath,0.020979,0.563726,8,8,8.0
8,GarageFinish,0.009187,0.549291,27,9,18.0
9,TotRmsAbvGrd,0.019372,0.537623,10,10,10.0


Correlation measures only linear dependence between variables and it does not detect non-linear dependence (in particular cor(X,Y) can be 0 for random variables X and Y=X^2, which are of course completely dependent). So, if a given feature has high feature_importance score, but low correlation it means that 'SalePrice' depend on it in a non-linear manner.

In [10]:
# plt.figure(figsize=(7, 7))
# plt.scatter(score_df['importance_score_rank'][:20], score_df['correlation_rank'][:20], c ="blue",
#             linewidths = 1)
# plt.title('importance_rank vs correlation_rank')
# plt.xticks(np.arange(0, 40, step=1))
# plt.yticks(np.arange(0, 40, step=1))
# plt.xlabel("importance_score_rank")
# plt.ylabel("correlation_rank")
# plt.show()

FITTING MODELS

In [171]:
from sklearn import linear_model
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [198]:
def eval(y_hat, y):
    print("Mean absolute error: %.2f" % np.mean(np.absolute(y_hat - y)))
    print("Residual sum of squares (MSE): %.2f" % np.mean((y_hat - y) ** 2))
    print( f'Normalized sum of squares error: {round(100*np.mean((y_hat - y) ** 2) / (np.mean(y ** 2)), 2)}%' )
    print("R2-score: %.2f" % r2_score(y_hat, y) )
    return [round(np.mean(np.absolute(y_hat - y)), 2), round(np.mean((y_hat - y) ** 2),2), 
            round(np.mean((y_hat - y) ** 2) / (np.mean(y ** 2)),2), 
            round(r2_score(y_hat, y),2)]

In [202]:
#Train-test split
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

metrics = {}

#MODEL 1: Simple linear regression
print('=====================')
print('Simple linear regression')
print('=====================')

#a) Data preparation
X_train = train['GrLivArea'].values.reshape(-1,1)
y_train = train['SalePrice']
X_test = test['GrLivArea'].values.reshape(-1,1)
y_test = test['SalePrice']

#b) Fitting the model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print ('Coefficients in simple linear regression: ', [round(num,2) for num in regr.coef_])
print ('Intercept in simple linear regression: ', round(regr.intercept_,2))

#c) Drawing a graph
# plt.scatter(train.GrLivArea, train.SalePrice,  color='blue')
# plt.plot(train.GrLivArea, regr.coef_*train.GrLivArea + regr.intercept_, '-r')
# plt.title('GrLivArea vs SalePrice')
# plt.xlabel("GrLivArea")
# plt.ylabel("SalePrice")

#d) Prediction and evaluation
y_test_hat = regr.predict(X_test)
metrics['SLR'] = eval(y_test_hat, y_test )
print('Variance score: %.2f' % regr.score(X_test, y_test))

#MODEL 2: Multi-linear regression with all features
print('=====================')
print('Multi-linear regression with all features')
print('=====================')
#a) Data preparation
X_train = train.drop(columns = ['SalePrice'])
y_train = train['SalePrice']
X_test = test.drop(columns = ['SalePrice'])
y_test = test['SalePrice']

#b) Fitting the model
ml_regr = linear_model.LinearRegression()
ml_regr.fit(X_train, y_train)
print (f'First 20 coefficients in multilinear regression: {[round(num, 2) for num in ml_regr.coef_[:20]]}')
print ('Intercept in multilinear regression: %.2f' % round(ml_regr.intercept_,2))

#c) Drawing a graph - not clear how to do it in this case.

#d) Prediction and evaluation
y_test_hat = ml_regr.predict(X_test)
metrics['MLR'] = eval(y_test_hat, y_test)
print('Variance score: %.2f' % ml_regr.score(X_test, y_test))

#MODEL 3: Multi-linear regression using 3 features most correlated with the SalePrice
print('=====================')
print('Multi-linear regression using 3 features most correlated with the SalePrice')
print('=====================')
#a) Data preparation
X_train = train[['GrLivArea','OverallQual', 'ExterQual']]
y_train = train['SalePrice']
X_test = test[['GrLivArea','OverallQual', 'ExterQual']]
y_test = test['SalePrice']

#b) Fitting the model
ml3_regr = linear_model.LinearRegression()
ml3_regr.fit(X_train, y_train)
print (f'Coefficients in multilinear regression:{[round(num, 2) for num in ml3_regr.coef_]}')
print ('Intercept in multilinear regression: ', round(ml3_regr.intercept_,2))

#c) Drawing a graph - not clear how to do it in this case.

#d) Prediction and evaluation
y_test_hat = ml3_regr.predict(X_test)
metrics['MLR3'] = eval(y_test_hat, y_test)

Simple linear regression
Coefficients in simple linear regression:  [106.58]
Intercept in simple linear regression:  19005.15
Mean absolute error: 39865.08
Residual sum of squares (MSE): 3435215272.80
Normalized sum of squares error: 8.15%
R2-score: -0.04
Variance score: 0.55
Multi-linear regression with all features
First 20 coefficients in multilinear regression: [-11256.57, -13636.53, -9421.19, -27952.58, 7338.24, 8517.66, -3839.02, -7559.42, 14461.53, 12570.95, 17394.75, 7069.02, 2026.83, 6143.65, -1857.31, 1425.0, -94.88, -886.71, -6886.0, 4036.59]
Intercept in multilinear regression: -620128.46
Mean absolute error: 18439.88
Residual sum of squares (MSE): 872213869.77
Normalized sum of squares error: 2.07%
R2-score: 0.85
Variance score: 0.88
Multi-linear regression using 3 features most correlated with the SalePrice
Coefficients in multilinear regression:[53.74, 22981.98, 31857.67]
Intercept in multilinear regression:  -148897.9
Mean absolute error: 28640.00
Residual sum of square

In [203]:
#MODEL 4: k nearest neighbors regression

#a) Data preparation 
X_train = train.drop(columns = ['SalePrice'])
y_train = train['SalePrice']
X_test = test.drop(columns = ['SalePrice'])
y_test = test['SalePrice']

#d) Prediction and evaluation
for x in range(1, 10):
    y_test_hat = KNeighborsRegressor(x).fit(X_train,y_train).predict(X_test)
    print('=====================')
    print(f'KNN regression with k = {x}')
    print('=====================')
    metrics[f'KNN_{x}']= eval(y_test_hat, y_test)

KNN regression with k = 1
Mean absolute error: 39501.98
Residual sum of squares (MSE): 3672220449.47
Normalized sum of squares error: 8.71%
R2-score: 0.29
KNN regression with k = 2
Mean absolute error: 36329.46
Residual sum of squares (MSE): 3466038146.86
Normalized sum of squares error: 8.22%
R2-score: 0.18
KNN regression with k = 3
Mean absolute error: 34247.41
Residual sum of squares (MSE): 3093984920.79
Normalized sum of squares error: 7.34%
R2-score: 0.22
KNN regression with k = 4
Mean absolute error: 32990.50
Residual sum of squares (MSE): 3017148726.88
Normalized sum of squares error: 7.16%
R2-score: 0.20
KNN regression with k = 5
Mean absolute error: 33426.02
Residual sum of squares (MSE): 3142498243.32
Normalized sum of squares error: 7.45%
R2-score: 0.11
KNN regression with k = 6
Mean absolute error: 33340.68
Residual sum of squares (MSE): 3147933120.29
Normalized sum of squares error: 7.47%
R2-score: 0.08
KNN regression with k = 7
Mean absolute error: 33547.89
Residual sum o

In [204]:
#MODEL 5: SVM regression

#a) Data preparation 
X_train = train.drop(columns = ['SalePrice'])
y_train = train['SalePrice']
X_test = test.drop(columns = ['SalePrice'])
y_test = test['SalePrice']

#d) Prediction and evaluation
for x in range(1,5):
    y_test_hat = LinearSVR(C=1, epsilon=25000*x, max_iter=100000).fit(X_train,y_train).predict(X_test)
    print('=====================')
    print(f'Support vector regression with epsilon = {25000*x}')
    print('=====================')
    metrics[f'SVM_epsilon={25000*x}'] = eval(y_test_hat, y_test)



Support vector regression with epsilon = 25000
Mean absolute error: 25484.13
Residual sum of squares (MSE): 1513433299.46
Normalized sum of squares error: 3.59%
R2-score: 0.67




Support vector regression with epsilon = 50000
Mean absolute error: 25945.70
Residual sum of squares (MSE): 1437666768.69
Normalized sum of squares error: 3.41%
R2-score: 0.73




Support vector regression with epsilon = 75000
Mean absolute error: 28446.65
Residual sum of squares (MSE): 1620942895.99
Normalized sum of squares error: 3.85%
R2-score: 0.73
Support vector regression with epsilon = 100000
Mean absolute error: 31974.25
Residual sum of squares (MSE): 2008554010.33
Normalized sum of squares error: 4.76%
R2-score: 0.55




In [205]:
#MODEL 6: Decision tree

#a) Data preparation 
X_train = train.drop(columns = ['SalePrice'])
y_train = train['SalePrice']
X_test = test.drop(columns = ['SalePrice'])
y_test = test['SalePrice']

#d) Prediction and evaluation
for x in range(1,10):
    y_test_hat = DecisionTreeRegressor(max_depth = x).fit(X_train,y_train).predict(X_test)
    print('=====================')
    print(f'Decision tree with max_depth = {x}')
    print('=====================')
    metrics[f'Decision_tree_max_depth={x}'] = eval(y_test_hat, y_test)

Decision tree with max_depth = 1
Mean absolute error: 44522.43
Residual sum of squares (MSE): 4279754209.15
Normalized sum of squares error: 10.15%
R2-score: -0.49
Decision tree with max_depth = 2
Mean absolute error: 36167.78
Residual sum of squares (MSE): 2958737522.00
Normalized sum of squares error: 7.02%
R2-score: 0.21
Decision tree with max_depth = 3
Mean absolute error: 32151.01
Residual sum of squares (MSE): 2259665027.34
Normalized sum of squares error: 5.36%
R2-score: 0.46
Decision tree with max_depth = 4
Mean absolute error: 29222.93
Residual sum of squares (MSE): 1967003972.08
Normalized sum of squares error: 4.67%
R2-score: 0.57
Decision tree with max_depth = 5
Mean absolute error: 26945.08
Residual sum of squares (MSE): 1701538392.00
Normalized sum of squares error: 4.04%
R2-score: 0.69
Decision tree with max_depth = 6
Mean absolute error: 25305.99
Residual sum of squares (MSE): 1477304941.02
Normalized sum of squares error: 3.5%
R2-score: 0.75
Decision tree with max_dept

In [225]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df = pd.DataFrame.from_dict(metrics, orient='index', dtype=float, columns= ['MAE', 'MSE','relative MSE', 'R2'])
df

Unnamed: 0,MAE,MSE,relative MSE,R2
SLR,39865.08,3435215272.8,0.08,-0.04
MLR,18439.88,872213869.77,0.02,0.85
MLR3,28640.0,1892854037.1,0.04,0.57
KNN_1,39501.98,3672220449.47,0.09,0.29
KNN_2,36329.46,3466038146.86,0.08,0.18
KNN_3,34247.41,3093984920.79,0.07,0.22
KNN_4,32990.5,3017148726.88,0.07,0.2
KNN_5,33426.02,3142498243.32,0.07,0.11
KNN_6,33340.68,3147933120.29,0.07,0.08
KNN_7,33547.89,3231829144.76,0.08,0.06
