In [38]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import iqr
from sklearn.preprocessing import LabelEncoder

In [39]:
def get_cat_num_features(df):
    num_features = []
    cat_features = []
    for col in df.columns:
        if df[col].dtype=="float64" or df[col].dtype=="int64":
            num_features.append(col)
        else:
            cat_features.append(col)
    return num_features, cat_features
def get_outliers(df, num_feat):
    outlier_df = pd.DataFrame(columns=['Feature', 'Total Outliers','Upper limit', 'Lower limit'])
    for col in num_feat:
        lower=np.percentile(df[col],25)-(1.5*iqr(df[col]))
        upper=np.percentile(df[col],75)+(1.5*iqr(df[col]))
        upper_outliers = df[df[col] > upper]
        lower_outliers = df[df[col] < lower]
        total=lower_outliers.shape[0]+upper_outliers.shape[0]
        if (total!=0) and (upper!=0 and lower!=0):
            outlier_df = outlier_df.append({'Feature':col, 'Total Outliers': total,'Upper limit': upper, 'Lower limit':lower}, ignore_index=True)
    return outlier_df
def remove_outliers(df, outlier_df, num_feat):
    for col in outlier_df['Feature'].to_list():
        upper = outlier_df[outlier_df['Feature']== col ]['Upper limit'].values[0]
        lower = outlier_df[outlier_df['Feature']== col ]['Lower limit'].values[0]
        df[col] = np.where(df[col]>upper, upper, df[col])
        df[col] = np.where(df[col]<lower, lower, df[col])
    return df

In [40]:
data=pd.read_csv('house-prices-advanced-regression-techniques/train.csv',index_col='Id')
data2=pd.read_csv('house-prices-advanced-regression-techniques/test.csv',index_col='Id')
temp=data['SalePrice']
data=data.drop(['SalePrice'],axis=1)
data=pd.concat([data,data2])
data=data.fillna({'Alley':'NA','BsmtQual':'NA','BsmtCond':'NA','BsmtExposure':'NA','BsmtFinType1':'NA','BsmtFinType2':'NA','FireplaceQu':'NA','GarageType':'NA','GarageFinish':'NA','GarageQual':'NA','GarageCond':'NA','PoolQC':'NA','Fence':'NA','MiscFeature':'NA','Exterior2nd':'NA','Functional':'Typ'})
cate=['MSZoning','Exterior1st','MasVnrType','Electrical','KitchenQual','Functional','GarageYrBlt','SaleType']
for item in cate:
    data[item]=data[item].fillna(data[item].value_counts().index[0])
data=data.drop(['Utilities'],axis=1)
data=data.drop(['KitchenAbvGr'],axis=1)
tem=np.where(data['MasVnrType']=='None',0,data['MasVnrArea'])
data['MasVnrArea']=data['MasVnrArea'].fillna(pd.Series(tem))
tem=np.where(data['BsmtFinType1']=='NA',0,data['BsmtFinSF1'])
data['BsmtFinSF1']=data['BsmtFinSF1'].fillna(pd.Series(tem))
tem=np.where(data['BsmtFinType2']=='NA',0,data['BsmtFinSF2'])
data['BsmtFinSF2']=data['BsmtFinSF2'].fillna(pd.Series(tem))
tem=np.where(data['BsmtQual']=='NA',0,data['BsmtUnfSF'])
data['BsmtUnfSF']=data['BsmtUnfSF'].fillna(pd.Series(tem))
tem=np.where(data['BsmtQual']=='NA',0,data['BsmtHalfBath'])
data['BsmtHalfBath']=data['BsmtHalfBath'].fillna(pd.Series(tem))
tem=np.where(data['BsmtQual']=='NA',0,data['BsmtFullBath'])
data['BsmtFullBath']=data['BsmtFullBath'].fillna(pd.Series(tem))
tem=np.where(data['GarageType']=='NA',0,data['GarageCars'])
data['GarageCars']=data['GarageCars'].fillna(pd.Series(tem))
tem=np.where(data['GarageType']=='NA',0,data['GarageArea'])
data['GarageArea']=data['GarageArea'].fillna(pd.Series(tem))
data['LotFrontage']=data.groupby('Neighborhood')["LotFrontage"].transform(lambda x: x.fillna(x.median()))
data['TotalBsmtSF']=(data['BsmtFinSF1']+data['BsmtFinSF2']+data['BsmtUnfSF'])
data['TotalSF']=data['TotalBsmtSF']+data['1stFlrSF']+data['2ndFlrSF']
data['Total_sqr_footage'] = (data['BsmtFinSF1']+data['BsmtFinSF2']+data['1stFlrSF']+data['2ndFlrSF'])
data['Total_Bathrooms'] = (data['FullBath'] + (0.5*data['HalfBath'])+data['BsmtFullBath']+(0.5*data['BsmtHalfBath']))
data['Total_porch_sf'] = (data['OpenPorchSF']+data['3SsnPorch']+data['EnclosedPorch'] + data['ScreenPorch']+data['WoodDeckSF'])
data=data.drop(columns=['BsmtFinSF1','BsmtFinSF2','1stFlrSF','2ndFlrSF','FullBath','HalfBath','BsmtFullBath','BsmtHalfBath','OpenPorchSF','3SsnPorch','EnclosedPorch','ScreenPorch','WoodDeckSF'])
data['MSSubClass'] = data['MSSubClass'].apply(str)
data['OverallCond'] = data['OverallCond'].astype(str)
tem=np.where(data['YrSold']>data['GarageYrBlt'],data['YrSold'],np.where(data['GarageYrBlt']>data['YearRemodAdd'],data['GarageYrBlt'],data['YearRemodAdd']))
data['age']=tem-data['YearBuilt']
data['YearRemodAdd']=data['YearRemodAdd'].astype(str)
data['YearBuilt']=data['YearBuilt'].astype(str)
data['GarageYrBlt']=data['GarageYrBlt'].astype(str)
data['YrSold'] = data['YrSold'].astype(str)
data['MoSold'] = data['MoSold'].astype(str)
od=['LotShape','LandContour','LandSlope','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1', 'BsmtFinType2','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual', 'GarageCond', 'PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']
dm=['MSSubClass','MSZoning','Street','Alley','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl', 'Exterior1st', 'Exterior2nd','MasVnrType','Foundation','Heating','YearBuilt','YrSold','GarageYrBlt','YearRemodAdd']
num_feat,_=get_cat_num_features(data)
data=pd.get_dummies(data,columns=dm)
from sklearn.preprocessing import LabelEncoder
for col in data.select_dtypes('object').columns:
    if col in od:
        le=LabelEncoder()
        data[col]=le.fit_transform(data[col])
    else:
        continue
outlier_df = get_outliers(data, num_feat)
data=remove_outliers(data,outlier_df,num_feat)
data2=data.iloc[1460:,:]
data=data.iloc[0:1460,:]
data=data.join(temp)
outlier_df
data.head()
data.describe()

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,LandSlope,OverallQual,OverallCond,MasVnrArea,ExterQual,ExterCond,...,YearRemodAdd_2002,YearRemodAdd_2003,YearRemodAdd_2004,YearRemodAdd_2005,YearRemodAdd_2006,YearRemodAdd_2007,YearRemodAdd_2008,YearRemodAdd_2009,YearRemodAdd_2010,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,69.468151,9648.871233,1.942466,2.777397,0.062329,6.100685,4.575342,89.795205,2.539726,3.733562,...,0.032877,0.034932,0.042466,0.05,0.066438,0.052055,0.027397,0.015753,0.00411,180921.19589
std,18.024406,3598.128683,1.409156,0.707666,0.276232,1.37843,1.112799,134.181252,0.693995,0.731807,...,0.178375,0.183669,0.201718,0.21802,0.249132,0.222214,0.163294,0.124563,0.063996,79442.502883
min,30.0,1340.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34900.0
25%,60.0,7553.5,0.0,3.0,0.0,5.0,4.0,0.0,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129975.0
50%,70.0,9478.5,3.0,3.0,0.0,6.0,4.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163000.0
75%,80.0,11601.5,3.0,3.0,0.0,7.0,5.0,166.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,214000.0
max,110.0,17708.0,3.0,3.0,2.0,10.0,8.0,410.0,3.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,755000.0


In [41]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
c=data.corr()
print(c[c['SalePrice']>=0.49]['SalePrice'].sort_values().tail(11).index)
x=data.iloc[:,:-1]
y=data.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Index(['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms',
       'TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage',
       'OverallQual', 'TotalSF', 'SalePrice'],
      dtype='object')


In [42]:
regressor = LinearRegression()
regressor.fit(x_train[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']],y_train/100000)
y_pred=regressor.predict(x_test[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']])
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test/100000, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test/100000, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test/100000, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test/100000, y_pred))
x_2=data2[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']]
y_pred=regressor.predict(x_2)
print(y_pred)
my_submission = pd.DataFrame({'Id': data2.index, 'SalePrice': 100000*y_pred})
my_submission.to_csv('submission.csv', index=False)

Mean Absolute Error: 0.2463773599860386
Mean Squared Error: 0.1695312791401654
Root Mean Squared Error: 0.4117417626864749
R-squared Error: 0.77897774545584
[1.15920169 1.70868998 1.80509885 ... 1.65649593 1.05731567 2.53245366]


In [43]:
regressor = LinearRegression()
regressor.fit(x_train[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']],np.log(y_train/100000))
y_pred=regressor.predict(x_test[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']])
print('Mean Absolute Error:', metrics.mean_absolute_error(np.log(y_test/100000), y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(np.log(y_test/100000), y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(np.log(y_test/100000), y_pred)))
print('R-squared Error:', metrics.r2_score(np.log(y_test/100000), y_pred))
x_2=data2[['Foundation_PConc', 'TotRmsAbvGrd', 'GarageArea', 'Total_Bathrooms','TotalBsmtSF', 'GarageCars', 'GrLivArea', 'Total_sqr_footage','OverallQual', 'TotalSF']]
y_pred=regressor.predict(x_2)
print(y_pred)
my_submission = pd.DataFrame({'Id': data2.index, 'SalePrice': 100000*np.exp(y_pred)})
my_submission.to_csv('submission.csv', index=False)

Mean Absolute Error: 0.11825479380459407
Mean Squared Error: 0.028280509094734094
Root Mean Squared Error: 0.16816809773180552
R-squared Error: 0.8484541240084942
[0.13641684 0.41714337 0.52842757 ... 0.41825464 0.09191635 0.89494079]
