In [133]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from scipy.stats import iqr
from sklearn import metrics
%matplotlib inline

In [None]:
#Functions

# Takes a dataframe and splits its column names into two lists:
# num_features (columns whose dtype is float64 or int64) and cat_features (all other columns).
# Returns the pair (num_features, cat_features).
def get_cat_num_features(df):
    """Split the column names of ``df`` into numeric and categorical lists.

    A column counts as numeric only when its dtype compares equal to
    ``"float64"`` or ``"int64"``. Every other dtype (object, bool, and also
    narrower numeric widths such as uint8 or int32) is placed in the
    categorical list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(num_features, cat_features)``, each in the original column order.
    """
    numeric_kinds = ("float64", "int64")
    num_features, cat_features = [], []
    for name in df.columns:
        column_dtype = df[name].dtype
        matches_numeric = any(column_dtype == kind for kind in numeric_kinds)
        # Route the name to whichever list matches its dtype.
        bucket = num_features if matches_numeric else cat_features
        bucket.append(name)
    return num_features, cat_features

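# Takes the original dataframe (original_dataframe), the outlier summary dataframe produced by
# get_outliers (get_outlier_dataframe), and the list of column names to check (dataframe_features).
# Values beyond a feature's upper/lower limit are capped at that limit; no rows are dropped.
# rm_outliers=True computes the outlier summary internally, so get_outliers need not be called first.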
def remove_outliers(original_dataframe = [], get_outlier_dataframe = [], dataframe_features = [], rm_outliers=False):
    """Cap outlying values at the limits listed in an outlier summary frame.

    Despite the name, no rows are removed: for every feature listed in the
    summary, values above ``'Upper limit'`` are replaced by that limit and
    values below ``'Lower limit'`` are replaced by that limit.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame to cap. It is modified in place and also returned.
    get_outlier_dataframe : pandas.DataFrame
        Summary with ``'Feature'``, ``'Upper limit'`` and ``'Lower limit'``
        columns. Ignored when ``rm_outliers`` is true.
    dataframe_features : list
        Column names handed to ``get_outliers``; only read when
        ``rm_outliers`` is true.
    rm_outliers : bool
        When true, the summary is computed here via ``get_outliers`` instead
        of being taken from ``get_outlier_dataframe``.

    Returns
    -------
    pandas.DataFrame
        The same ``original_dataframe`` object, after capping.
    """
    if rm_outliers:
        limits = get_outliers(original_dataframe, dataframe_features)
    else:
        limits = get_outlier_dataframe

    for feature in limits['Feature'].to_list():
        # First summary row for this feature carries its bounds.
        bounds = limits[limits['Feature'] == feature]
        upper = bounds['Upper limit'].values[0]
        lower = bounds['Lower limit'].values[0]

        current = original_dataframe[feature]
        capped_high = np.where(current > upper, upper, current)
        original_dataframe[feature] = np.where(capped_high < lower, lower, capped_high)

    return original_dataframe


# Takes a dataframe and a list of numeric column names (num_feat).
# Uses the IQR rule to identify outliers: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
# Returns a dataframe with one row per affected column: column name (Feature), total outlier count
# (Total Outliers), and the bounds (Upper limit, Lower limit). Columns with no outliers, or with a
# bound equal to 0, are left out.
# To cap the identified outliers, pass the result to remove_outliers (or call it with rm_outliers=True).
def get_outliers(df, num_feat):
    """Summarise IQR-rule outliers for the requested numeric columns.

    For each column the bounds are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``.
    A column is reported only when at least one value lies outside the bounds
    AND neither bound is exactly 0 (so columns whose quartiles collapse to 0
    are never reported).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    num_feat : iterable
        Names of the columns of ``df`` to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'``, ``'Total Outliers'``, ``'Upper limit'`` and
        ``'Lower limit'``, one row per reported column, in ``num_feat`` order.
        Empty (but with these columns) when nothing is reported.

    Notes
    -----
    ``np.percentile`` does not skip NaN, so a column containing NaN yields
    NaN bounds, matches no values, and is therefore not reported.
    """
    summary_columns = ['Feature', 'Total Outliers', 'Upper limit', 'Lower limit']
    records = []
    for col in num_feat:
        values = df[col]
        q1, q3 = np.percentile(values, [25, 75])
        whisker = 1.5 * iqr(values)  # computed once and reused for both bounds
        lower = q1 - whisker
        upper = q3 + whisker
        total = int((values > upper).sum() + (values < lower).sum())
        if total != 0 and upper != 0 and lower != 0:
            records.append({'Feature': col, 'Total Outliers': total,
                            'Upper limit': upper, 'Lower limit': lower})

    # Build the frame in one go: DataFrame.append no longer exists in
    # pandas >= 2.0, and appending row by row is quadratic anyway.
    return pd.DataFrame(records, columns=summary_columns)


In [135]:
# --- Load ---------------------------------------------------------------
# Train and test CSVs are stacked so that imputation and encoding are applied
# identically to both partitions; they are split apart again further down.
data=pd.read_csv('house-prices-advanced-regression-techniques/train.csv',index_col='Id')
data2=pd.read_csv('house-prices-advanced-regression-techniques/test.csv',index_col='Id')
temp=data['SalePrice']   # target, re-attached to the training rows after preprocessing
n_train=len(data)        # partition boundary, measured instead of hard-coded
data=data.drop(['SalePrice'],axis=1)
data=pd.concat([data,data2])

# --- Impute categorical columns ----------------------------------------
# Missing entries in these columns get an explicit 'NA' label ('Typ' for Functional).
data=data.fillna({
    'Alley':'NA','BsmtQual':'NA','BsmtCond':'NA','BsmtExposure':'NA',
    'BsmtFinType1':'NA','BsmtFinType2':'NA','FireplaceQu':'NA','GarageType':'NA',
    'GarageFinish':'NA','GarageQual':'NA','GarageCond':'NA','PoolQC':'NA',
    'Fence':'NA','MiscFeature':'NA','Exterior2nd':'NA','Functional':'Typ',
})
# Remaining gaps in these columns are filled with the most frequent value.
cate=['MSZoning','Exterior1st','MasVnrType','Electrical','KitchenQual','Functional','GarageYrBlt','SaleType']
for item in cate:
    data[item]=data[item].fillna(data[item].value_counts().index[0])
data=data.drop(['Utilities','KitchenAbvGr'],axis=1)

# --- Impute numeric columns tied to an "absent feature" label ------------
# (numeric column, indicator column, label meaning the feature is absent)
zero_when_absent=[
    ('MasVnrArea','MasVnrType','None'),
    ('BsmtFinSF1','BsmtFinType1','NA'),
    ('BsmtFinSF2','BsmtFinType2','NA'),
    ('BsmtUnfSF','BsmtQual','NA'),
    ('BsmtHalfBath','BsmtQual','NA'),
    ('BsmtFullBath','BsmtQual','NA'),
    ('GarageCars','GarageType','NA'),
    ('GarageArea','GarageType','NA'),
]
for target_col,indicator_col,absent_label in zero_when_absent:
    # Fill through a boolean mask on the frame's own index. Filling from a
    # freshly built pd.Series(...) would align its 0-based RangeIndex against
    # the Id index and silently copy values from the wrong row.
    missing_and_absent=data[target_col].isna() & (data[indicator_col]==absent_label)
    data.loc[missing_and_absent,target_col]=0
    # Anything still missing belongs to a row where the feature is present but
    # unmeasured; use the column median so later model calls never see NaN.
    # NOTE(review): the median fallback is a modelling choice - confirm.
    data[target_col]=data[target_col].fillna(data[target_col].median())

data['LotFrontage']=data.groupby('Neighborhood')["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# --- Derived features ---------------------------------------------------
data['TotalBsmtSF']=(data['BsmtFinSF1']+data['BsmtFinSF2']+data['BsmtUnfSF'])
data['TotalSF']=data['TotalBsmtSF']+data['1stFlrSF']+data['2ndFlrSF']
data['Total_sqr_footage'] = (data['BsmtFinSF1']+data['BsmtFinSF2']+data['1stFlrSF']+data['2ndFlrSF'])
data['Total_Bathrooms'] = (data['FullBath'] + (0.5*data['HalfBath'])+data['BsmtFullBath']+(0.5*data['BsmtHalfBath']))
data['Total_porch_sf'] = (data['OpenPorchSF']+data['3SsnPorch']+data['EnclosedPorch'] + data['ScreenPorch']+data['WoodDeckSF'])
data['MSSubClass'] = data['MSSubClass'].astype(str)
data['OverallCond'] = data['OverallCond'].astype(str)
# Reference year: YrSold when it exceeds GarageYrBlt, otherwise the later of
# GarageYrBlt and YearRemodAdd; 'age' is that year minus YearBuilt.
# NOTE(review): the branch order looks unusual for a "latest year" rule - confirm intent.
tem=np.where(data['YrSold']>data['GarageYrBlt'],data['YrSold'],np.where(data['GarageYrBlt']>data['YearRemodAdd'],data['GarageYrBlt'],data['YearRemodAdd']))
data['age']=tem-data['YearBuilt']
data['YearBuilt']=data['YearBuilt'].astype(str)
data['YrSold'] = data['YrSold'].astype(str)
data['MoSold'] = data['MoSold'].astype(str)

# --- Encode categoricals -----------------------------------------------
# od: columns integer-coded with LabelEncoder; dm: columns one-hot encoded.
od=['MSZoning','LotShape','LandContour','LandSlope','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1', 'BsmtFinType2','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual', 'GarageCond', 'PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']
dm=['MSSubClass','Street','Alley','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl', 'Exterior1st', 'Exterior2nd','MasVnrType','Foundation','Heating']
data=pd.get_dummies(data,columns=dm)
ce=['YearBuilt','YrSold','GarageYrBlt','YearRemodAdd']   # year columns never label-encoded
for col in data.select_dtypes('object').columns:
    if col in od and col not in ce:
        le=LabelEncoder()
        data[col]=le.fit_transform(data[col])

# --- Split back into train / test and cap outliers on the training rows --
data2=data.iloc[n_train:,:]
data=data.iloc[:n_train,:]
data=data.join(temp)
# NOTE(review): num_feat is every float64/int64 column of the joined frame, so
# it presumably includes SalePrice and any integer-coded categoricals; the
# test partition (data2) is not capped. Confirm this is intended.
num_feat,cat_feat=get_cat_num_features(data)
outlier_df = get_outliers(data, num_feat)
data=remove_outliers(data,outlier_df,num_feat)

In [136]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
# Pairwise correlations of the prepared training frame; show the 11 entries
# that sort last for SalePrice.
c = data.corr()
print(c.loc[:, 'SalePrice'].sort_values().iloc[-11:])

# The last column is the target; everything before it is a predictor.
x, y = data.iloc[:, :-1], data.iloc[:, -1]
# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

TotalBsmtSF          0.645251
GarageArea           0.660029
Total_Bathrooms      0.666131
GarageCars           0.675896
GrLivArea            0.729311
Total_sqr_footage    0.744944
OverallQual          0.817680
TotalSF              0.827890
SalePrice            1.000000
MSSubClass_150            NaN
Exterior2nd_NA            NaN
Name: SalePrice, dtype: float64


In [137]:
# Linear-regression baseline on a hand-picked feature subset.
# The target is divided by 100000 for fitting and scoring, and multiplied
# back when the submission is written.
lr_features = ['LotArea','OverallQual','OverallCond','YearRemodAdd','BsmtQual','GarageType','GarageCars','TotalSF','Total_sqr_footage','Total_Bathrooms']
regressor = LinearRegression()
regressor.fit(x_train[lr_features], y_train/100000)

# Hold-out evaluation.
y_pred = regressor.predict(x_test[lr_features])
lr_target = y_test/100000
lr_mse = metrics.mean_squared_error(lr_target, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(lr_target, y_pred))
print('Mean Squared Error:', lr_mse)
print('Root Mean Squared Error:', np.sqrt(lr_mse))
print('R-squared Error:', metrics.r2_score(lr_target, y_pred))

# Predict the test partition and write the submission file.
x_2 = data2[lr_features]
y_pred = regressor.predict(x_2)
my_submission = pd.DataFrame({'Id': data2.index, 'SalePrice': (100000*y_pred)})
my_submission.to_csv('submission.csv', index=False)

Mean Absolute Error: 0.19355965260963767
Mean Squared Error: 0.07763445435962789
Root Mean Squared Error: 0.27862960065224207
R-squared Error: 0.8223799693764585


In [138]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
# Decision-tree baseline on the same hand-picked feature subset, with the
# target divided by 100000 for fitting and scoring.
dt_features = ['LotArea','OverallQual','OverallCond','YearRemodAdd','BsmtQual','GarageType','GarageCars','TotalSF','Total_sqr_footage','Total_Bathrooms']
# Seed the tree: without random_state the fitted tree (and therefore the
# metrics printed below) can differ from one run to the next.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(x_train[dt_features], y_train/100000)

# Hold-out evaluation.
y_pred = dt.predict(x_test[dt_features])
dt_target = y_test/100000
dt_mse = metrics.mean_squared_error(dt_target, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(dt_target, y_pred))
print('Mean Squared Error:', dt_mse)
print('Root Mean Squared Error:', np.sqrt(dt_mse))
print('R-squared Error:', metrics.r2_score(dt_target, y_pred))

Mean Absolute Error: 0.23055684931506853
Mean Squared Error: 0.10509323406284246
Root Mean Squared Error: 0.32418086628122034
R-squared Error: 0.7595569698203961
