In [121]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from scipy.stats import iqr
from sklearn import metrics
%matplotlib inline

In [126]:
#Functions

# Takes a dataframe and initializes two lists (num_features: names of the float64/int64 columns,
# cat_features: names of all remaining columns); returns both lists
def get_cat_num_features(df):
    """Partition the column names of ``df`` by dtype.

    A column counts as numeric only when its dtype is exactly ``float64`` or
    ``int64``; every other column (strings, booleans, narrower numeric types,
    ...) lands in the second list.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_column_names, other_column_names)``, each in the order the
        columns appear in ``df``.
    """
    numeric_kinds = ("float64", "int64")
    numeric_cols, other_cols = [], []
    for name in df.columns:
        is_numeric = any(df[name].dtype == kind for kind in numeric_kinds)
        bucket = numeric_cols if is_numeric else other_cols
        bucket.append(name)
    return numeric_cols, other_cols

# Takes the original dataframe (original_dataframe), a dataframe of outlier limits (get_outlier_dataframe)
# and the list of column names to check (dataframe_features)
# rm_outliers=True computes the limits internally via get_outliers, so it does not have to be called beforehand
def remove_outliers(original_dataframe = [], get_outlier_dataframe = [], dataframe_features = [], rm_outliers=False):
    """Clip outlying values to the limits listed in an outlier table.

    Despite the name, no rows are removed: for every feature named in the
    limits table, values above ``'Upper limit'`` are replaced by that limit and
    values below ``'Lower limit'`` are replaced by that limit.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame to clip. It is modified IN PLACE and also returned.
    get_outlier_dataframe : pandas.DataFrame
        Table with the columns ``'Feature'``, ``'Upper limit'`` and
        ``'Lower limit'``. Ignored when ``rm_outliers`` is true.
    dataframe_features : list
        Column names handed to ``get_outliers``; only used when
        ``rm_outliers`` is true.
    rm_outliers : bool
        When true, the limits table is computed here by calling
        ``get_outliers(original_dataframe, dataframe_features)``.

    Returns
    -------
    The same object that was passed as ``original_dataframe``.
    """
    # The list defaults are only placeholders; they are never mutated.
    if rm_outliers:
        get_outlier_dataframe = get_outliers(original_dataframe,dataframe_features)

    # No limits available (default argument or an empty table): nothing to
    # clip. Previously the default list crashed on ['Feature'] with TypeError.
    if len(get_outlier_dataframe) == 0:
        return original_dataframe

    # Walk the table once instead of re-filtering it twice per feature.
    limits = zip(
        get_outlier_dataframe['Feature'],
        get_outlier_dataframe['Upper limit'],
        get_outlier_dataframe['Lower limit'],
    )
    for col, upper, lower in limits:
        values = original_dataframe[col]
        values = np.where(values > upper, upper, values)
        original_dataframe[col] = np.where(values < lower, lower, values)

    return original_dataframe


# Takes a dataframe and the list of numeric column names to check (num_feat)
# Uses descriptive statistics (IQR) to identify outliers: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
# Creates a dataframe with one row per affected column: column name, total outlier count, and the upper/lower limits
# The result can be passed to remove_outliers; calling remove_outliers with rm_outliers=True runs this function automatically
def get_outliers(df, num_feat):
    """Summarise IQR-based outliers for the given columns.

    For each column in ``num_feat`` the limits are ``Q1 - 1.5*IQR`` and
    ``Q3 + 1.5*IQR``. A column is reported only when it has at least one value
    outside those limits AND neither limit equals 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    num_feat : iterable
        Names of the columns to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'``, ``'Total Outliers'``, ``'Upper limit'`` and
        ``'Lower limit'``; one row per reported column, empty when none qualify.

    Notes
    -----
    ``np.percentile`` and ``scipy.stats.iqr`` propagate NaN by default, so a
    column containing missing values yields NaN limits and is never reported.
    """
    columns = ['Feature', 'Total Outliers','Upper limit', 'Lower limit']
    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0.
    records = []
    for col in num_feat:
        values = df[col]
        whisker = 1.5 * iqr(values)
        lower = np.percentile(values, 25) - whisker
        upper = np.percentile(values, 75) + whisker
        total = int((values > upper).sum() + (values < lower).sum())
        if total != 0 and upper != 0 and lower != 0:
            records.append({'Feature':col, 'Total Outliers': total,'Upper limit': upper, 'Lower limit':lower})

    return pd.DataFrame(records, columns=columns)
    


In [127]:
# --- Load -------------------------------------------------------------------
# Train and test are cleaned together so both receive identical imputations
# and engineered features; they are split apart again at the end of the cell.
data=pd.read_csv('house-prices-advanced-regression-techniques/train.csv',index_col='Id')
data2=pd.read_csv('house-prices-advanced-regression-techniques/test.csv',index_col='Id')
temp=data['SalePrice']                 # target, re-attached to the train rows below
data=data.drop(['SalePrice'],axis=1)
n_train=len(data)                      # remember the split point instead of hard-coding 1460
data=pd.concat([data,data2])

# --- Categorical imputation -------------------------------------------------
# Missing values in these columns are replaced by the literal label shown
# ('NA' for most of them, 'Typ' for Functional).
data=data.fillna({'Alley':'NA','BsmtQual':'NA','BsmtCond':'NA','BsmtExposure':'NA','BsmtFinType1':'NA','BsmtFinType2':'NA','FireplaceQu':'NA','GarageType':'NA','GarageFinish':'NA','GarageQual':'NA','GarageCond':'NA','PoolQC':'NA','Fence':'NA','MiscFeature':'NA','Exterior2nd':'NA','Functional':'Typ'})
# Remaining gaps in these columns are filled with the most frequent value.
cate=['MSZoning','Exterior1st','MasVnrType','Electrical','KitchenQual','Functional','GarageYrBlt','SaleType']
for item in cate:
    data[item]=data[item].fillna(data[item].value_counts().index[0])
data=data.drop(['Utilities'],axis=1)
data=data.drop(['KitchenAbvGr'],axis=1)

# --- Numeric imputation: 0 where the related component is absent -------------
# (flag column, value meaning "absent", numeric column to zero-fill)
# The previous fillna(pd.Series(tem)) aligned a 0-based RangeIndex against the
# 1-based Id index, so each gap was filled from the NEXT row. A boolean mask
# on the frame's own index avoids any label alignment.
# NOTE(review): pandas >= 2.0 appears to parse the literal string 'None' as NaN
# by default in read_csv, in which case the MasVnrType test never matches — verify.
zero_when_absent=[
    ('MasVnrType','None','MasVnrArea'),
    ('BsmtFinType1','NA','BsmtFinSF1'),
    ('BsmtFinType2','NA','BsmtFinSF2'),
    ('BsmtQual','NA','BsmtUnfSF'),
    ('BsmtQual','NA','BsmtHalfBath'),
    ('BsmtQual','NA','BsmtFullBath'),
    ('GarageType','NA','GarageCars'),
    ('GarageType','NA','GarageArea'),
]
for flag_col,absent_value,target_col in zero_when_absent:
    missing_and_absent=data[target_col].isna() & (data[flag_col]==absent_value)
    data.loc[missing_and_absent,target_col]=0
data['LotFrontage']=data.groupby('Neighborhood')["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# --- Feature engineering ----------------------------------------------------
data['TotalBsmtSF']=(data['BsmtFinSF1']+data['BsmtFinSF2']+data['BsmtUnfSF'])
data['TotalSF']=data['TotalBsmtSF']+data['1stFlrSF']+data['2ndFlrSF']
data['Total_sqr_footage'] = (data['BsmtFinSF1']+data['BsmtFinSF2']+data['1stFlrSF']+data['2ndFlrSF'])
data['Total_Bathrooms'] = (data['FullBath'] + (0.5*data['HalfBath'])+data['BsmtFullBath']+(0.5*data['BsmtHalfBath']))
data['Total_porch_sf'] = (data['OpenPorchSF']+data['3SsnPorch']+data['EnclosedPorch'] + data['ScreenPorch']+data['WoodDeckSF'])
# Codes, not quantities: stored as strings so they are treated as categories.
data['MSSubClass'] = data['MSSubClass'].apply(str)
data['OverallCond'] = data['OverallCond'].astype(str)
# 'age' = reference year minus YearBuilt, where the reference year is YrSold
# if it is later than GarageYrBlt, otherwise the later of GarageYrBlt and
# YearRemodAdd. Must run before the year columns are cast to str below.
tem=np.where(data['YrSold']>data['GarageYrBlt'],data['YrSold'],np.where(data['GarageYrBlt']>data['YearRemodAdd'],data['GarageYrBlt'],data['YearRemodAdd']))
data['age']=tem-data['YearBuilt']
data['YearBuilt']=data['YearBuilt'].astype(str)
data['YrSold'] = data['YrSold'].astype(str)
data['MoSold'] = data['MoSold'].astype(str)


# --- Split back into train / test -------------------------------------------
# .copy() makes both halves independent frames, so the later column
# assignments do not write into a slice of the combined frame.
data2=data.iloc[n_train:,:].copy()
data=data.iloc[:n_train,:].copy()
data=data.join(temp)
# num_feat is computed after SalePrice is joined, so the target column is
# eligible for clipping as well.
num_feat,cat_feat=get_cat_num_features(data)
data=remove_outliers(original_dataframe = data,dataframe_features = num_feat, rm_outliers=True)

In [128]:
# Inspect the cleaned training frame (features plus SalePrice).
data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,MoSold,YrSold,SaleType,SaleCondition,TotalSF,Total_sqr_footage,Total_Bathrooms,Total_porch_sf,age,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450.0,Pave,,Reg,Lvl,Inside,Gtl,...,2,2008,WD,Normal,2566.0,2416.0,3.25,61.0,5.0,208500.0
2,20,RL,80.0,9600.0,Pave,,Reg,Lvl,FR2,Gtl,...,5,2007,WD,Normal,2524.0,2240.0,2.50,298.0,31.0,181500.0
3,60,RL,68.0,11250.0,Pave,,IR1,Lvl,Inside,Gtl,...,9,2008,WD,Normal,2706.0,2272.0,3.25,42.0,7.0,223500.0
4,70,RL,60.0,9550.0,Pave,,IR1,Lvl,Corner,Gtl,...,2,2006,WD,Abnorml,2473.0,1933.0,2.00,307.0,91.0,140000.0
5,60,RL,84.0,14260.0,Pave,,IR1,Lvl,FR2,Gtl,...,12,2008,WD,Normal,3343.0,2853.0,3.25,276.0,8.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917.0,Pave,,Reg,Lvl,Inside,Gtl,...,8,2007,WD,Normal,2600.0,1647.0,2.50,40.0,8.0,175000.0
1457,20,RL,85.0,13175.0,Pave,,Reg,Lvl,Inside,Gtl,...,2,2010,WD,Normal,3615.0,3026.0,3.00,349.0,32.0,210000.0
1458,70,RL,66.0,9042.0,Pave,,Reg,Lvl,Inside,Gtl,...,5,2010,WD,Normal,3492.0,2615.0,2.00,60.0,69.0,266500.0
1459,20,RL,68.0,9717.0,Pave,,Reg,Lvl,Inside,Gtl,...,4,2010,WD,Normal,2156.0,2156.0,2.00,478.0,60.0,142125.0


In [129]:
# Set the target aside while the feature columns are encoded.
temp=data['SalePrice']
data=data.drop(['SalePrice'],axis=1)

# String-typed year columns are deliberately left un-encoded.
ce=['YearBuilt','YrSold','GarageYrBlt','YearRemodAdd']
for col in data.select_dtypes('object').columns:
    if col in ce:
        continue
    # Fit ONE encoder on the categories of both frames and apply it to each.
    # Fitting train and test separately (as before) can map the same category
    # to different integers, making the test features inconsistent with what
    # the model was trained on.
    le=LabelEncoder()
    le.fit(pd.concat([data[col],data2[col]]))
    data[col]=le.transform(data[col])
    data2[col]=le.transform(data2[col])
data=data.join(temp)

In [130]:
# Correlate numeric columns only: YearBuilt and YrSold are still strings, and
# DataFrame.corr() on pandas >= 2.0 raises on non-numeric columns instead of
# silently dropping them. select_dtypes works on old and new pandas alike.
c=data.select_dtypes(include=[np.number]).corr()
# The 11 columns most positively correlated with SalePrice (SalePrice itself included).
print(c['SalePrice'].sort_values().tail(11))
# Select features/target by name rather than relying on SalePrice being the last column.
x=data.drop(columns=['SalePrice'])
y=data['SalePrice']
# 80/20 hold-out split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

FullBath             0.583994
1stFlrSF             0.621873
TotalBsmtSF          0.645251
GarageArea           0.660029
Total_Bathrooms      0.666131
GarageCars           0.675896
GrLivArea            0.729311
Total_sqr_footage    0.744944
OverallQual          0.817680
TotalSF              0.827890
SalePrice            1.000000
Name: SalePrice, dtype: float64


In [131]:
# Columns fed to the model, listed once and reused for fit, evaluation and submission.
features=['LotArea','Neighborhood','OverallQual','YearRemodAdd','BsmtQual','GarageType','GarageCars','TotalSF','Total_sqr_footage','Total_Bathrooms']
# The target is modelled in units of 100,000; the same factor scales predictions back.
price_scale=100000

regressor = LinearRegression()
regressor.fit(x_train[features],y_train/price_scale)

# Hold-out evaluation, reported in the scaled units.
y_pred=regressor.predict(x_test[features])
y_true_scaled=y_test/price_scale
mse=metrics.mean_squared_error(y_true_scaled, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_true_scaled, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('R-squared Error:', metrics.r2_score(y_true_scaled, y_pred))

# Predict the competition test set and write the submission file in original price units.
x_2=data2[features]
y_pred=regressor.predict(x_2)
my_submission = pd.DataFrame({'Id': data2.index, 'SalePrice': (price_scale*y_pred)})
my_submission.to_csv('submission.csv', index=False)

Mean Absolute Error: 0.19403382930484064
Mean Squared Error: 0.07864489215516235
Root Mean Squared Error: 0.2804369664562116
R-squared Error: 0.8200681866291408


In [132]:

# Same feature set as the linear model, listed once for fit and evaluation.
dt_features=['LotArea','Neighborhood','OverallQual','YearRemodAdd','BsmtQual','GarageType','GarageCars','TotalSF','Total_sqr_footage','Total_Bathrooms']

# random_state pins the tree's internal randomness so the metrics below are
# reproducible across runs; 0 matches the seed used for train_test_split.
dt = DecisionTreeRegressor(random_state=0)
# The target is modelled in units of 100,000, as for the linear model.
dt.fit(x_train[dt_features],y_train/100000)
y_pred=dt.predict(x_test[dt_features])
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test/100000, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test/100000, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test/100000, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test/100000, y_pred))

Mean Absolute Error: 0.2220310616438356
Mean Squared Error: 0.10694408693989726
Root Mean Squared Error: 0.3270230679017877
R-squared Error: 0.7553223996490226
