# Importing Modules & Reading Data

In [1]:
import pandas as pd 
import random
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.decomposition import PCA
from sklearn import svm, preprocessing, tree, linear_model
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso, ElasticNet
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
import csv
from sklearn import metrics

# Load the raw training and test tables from the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep the test-set identifiers aside before the frames are merged and transformed.
ids = df_test['Id']

# Regression target: natural log of the training sale prices.
# `pop` removes the 'SalePrice' column from df_train in place and returns it,
# so the feature frame no longer contains the target.
y = np.log(df_train.pop('SalePrice').to_numpy())

# Preparing the data
Next we merge the training and the testing set, in order to deal with the categorical variables in a uniform way:

In [2]:
# Stack the training rows on top of the test rows so that every feature
# transformation below is applied to both sets identically.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
# The original row labels are kept (so labels repeat across the two sets);
# the frames are separated again by position at the end of the preparation.
df = pd.concat([df_train, df_test])

Categorical features that represent a quality grade can be transformed into numerical values:

In [3]:
# Ordered quality grades, best first.
conds = ['Ex', 'Gd', 'TA', 'Fa', 'Po']

# Features whose missing values are later imputed with the column mean:
# encoded on a 0..4 scale ('Po' -> 0, ..., 'Ex' -> 4).
quality_cols_zero_based = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']

# Features whose missing values are later filled with 0 to mean "feature absent":
# encoded on a 1..5 scale ('Po' -> 1, ..., 'Ex' -> 5) so that 0 stays reserved
# for "absent". GarageQual/GarageCond previously used the 0..4 scale, which made
# a 'Po' garage indistinguishable from a missing garage after the fill.
quality_cols_one_based = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond']

for i, grade in enumerate(conds):
	score = len(conds) - i - 1
	for col in quality_cols_zero_based:
		df.loc[df[col] == grade, col] = score
	for col in quality_cols_one_based:
		df.loc[df[col] == grade, col] = score + 1


I don't care when a remodeling took place, only whether it took place at all:

In [4]:
# Replace the remodel year by a binary flag: 1 if the house was remodeled
# (remodel year differs from the construction year), 0 otherwise.
# The comparison is evaluated once, before the column is overwritten, so the
# result does not depend on values written by an earlier statement. The previous
# two-step version ended up with the flag inverted (1 = never remodeled).
was_remodeled = df['YearRemodAdd'] != df['YearBuilt']
df['YearRemodAdd'] = was_remodeled.astype(int)

Fill in missing values in the above quality features: use 0 if the feature does not exist (basement, fireplace, garage), or the column mean if the value just seems to be missing:

In [5]:
# Quality features where a missing value is treated as unknown: impute the column mean.
mean_imputed_cols = ['ExterCond', 'ExterQual', 'HeatingQC', 'KitchenQual']
# Quality features where a missing value is treated as "feature absent": fill with 0.
zero_filled_cols = ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond']

# Assign the filled column back explicitly. Calling
# `df.Col.fillna(..., inplace=True)` operates on an intermediate Series and is
# not guaranteed to update `df` (it does not under pandas Copy-on-Write).
for col in mean_imputed_cols:
	df[col] = df[col].fillna(df[col].mean())
for col in zero_filled_cols:
	df[col] = df[col].fillna(0)

Missing 'LotFrontage' most likely means non-existence, hence set to zero. Translate 'CentralAir' into numerical values.

In [6]:
# A missing lot frontage is treated as "no frontage".
# Assigned back explicitly: an in-place fillna on `df.LotFrontage` works on an
# intermediate Series and is not guaranteed to update `df`.
df['LotFrontage'] = df['LotFrontage'].fillna(0)

# Encode the yes/no flag numerically: 'Y' -> 1, 'N' -> 0.
for flag, code in (('Y', 1), ('N', 0)):
	df.loc[df['CentralAir'] == flag, 'CentralAir'] = code

# Any remaining missing entries get the mean of the encoded column.
df['CentralAir'] = df['CentralAir'].fillna(df['CentralAir'].mean())

Translate all other categorical features into dummy variables:

In [7]:
# Remaining nominal features to be one-hot encoded.
columns_cat = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Passing `columns=` makes get_dummies encode every listed column regardless of
# dtype and drop the originals in the same step. Calling
# `pd.get_dummies(df[columns_cat])` without `columns=` only encodes
# object/category columns, so a numerically typed column such as 'MSSubClass'
# was passed through unchanged and then deleted without ever being encoded.
# Dummy columns are named '<feature>_<value>' and appended after the other columns.
df = pd.get_dummies(df, columns=columns_cat)

In most cases the garage was built in the same year as the house:

In [8]:
# Where the garage construction year is missing, fall back to the year the
# house itself was built.
garage_year_missing = df['GarageYrBlt'].isnull()
df.loc[garage_year_missing, 'GarageYrBlt'] = df.loc[garage_year_missing, 'YearBuilt']

Handle 'MasVnrType': missing values are replaced by the category 'None':

In [9]:
# Treat a missing masonry veneer type as its own category 'None'.
df['MasVnrType'] = df['MasVnrType'].fillna('None')

# One-hot encode the column and keep the result. Previously the `pd.concat`
# return value was discarded, so the dummies were never added and the feature
# was simply deleted. `columns=` also drops the original string column; the
# new columns are named 'MasVnrType_<value>'.
df = pd.get_dummies(df, columns=['MasVnrType'])

Very few values are still missing; replace them with the mean of the respective columns:

In [10]:
# Impute every value that is still missing with the mean of its column.
column_means = df.mean()
df = df.fillna(column_means)

Now we split df back into the training and testing dataframes:

In [None]:
# The training rows were stacked first, so they occupy the first len(y)
# positions (y holds one target value per training row). Deriving the boundary
# from the data avoids a hard-coded row count that breaks on a different file.
n_train = len(y)
assert len(df) == n_train + len(ids), "merged frame does not match train + test row counts"

df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]