In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing
import re # pattern searching
import seaborn as sns # plots
import matplotlib as mpl # figure artists, attributes
import matplotlib.pyplot as plt # figures, axes
from matplotlib.patches import Patch # artist manipulation
from scipy import stats # distributions/quantile plots
from datetime import datetime as dt #datetime manipulation
%matplotlib inline

from IPython.core.display import HTML

# Stylesheet applied to elements with the `.output_png` class: table-cell
# display with centred horizontal and vertical alignment.
OUTPUT_PNG_CSS = """
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
"""

# Last expression of the cell, so the HTML object (and with it the <style> tag) is rendered.
HTML(OUTPUT_PNG_CSS)

# **First view of the data**

<span style="font-size:18px">
    An overview of the data briefly conveys the features, their datatypes, the scale/range of values, missing values, etc. It is helpful in planning the data preprocessing techniques.
</span>

**Note: If reader is not viewing this notebook on kaggle, then downloading the datasets along with notebook is recommended. File paths to training and testing datasets should be modified accordingly**

In [11]:
# Locations of the competition files (Kaggle input directory layout)
train_path = '../input/house-prices-advanced-regression-techniques/train.csv' # file path for training data
test_path = '../input/house-prices-advanced-regression-techniques/test.csv' #file path for testing data

# Show up to 100 columns when a frame is displayed
pd.set_option('display.max_columns', 100)

# Both files are read with the 'Id' column as the row index
df_train, df_test = (pd.read_csv(csv_path, index_col='Id') for csv_path in (train_path, test_path))

# Detach the target: SalePrice is kept as a standalone Series and removed from the feature frame
SalePrice = df_train.pop('SalePrice')
df_train.head(20)

# **A DataFrame with column info and description**
<span style="font-size:18px">
             A DataFrame with the column names, column descriptions, column datatypes and per-column missing-value counts can be created to inspect incorrect datatypes and missing values
</span>

In [12]:
# Read the free-text data dictionary; the context manager closes the file handle
with open('../input/house-prices-advanced-regression-techniques/data_description.txt', 'r') as txt:
    lines = txt.readlines()

# A line that starts with a non-whitespace character and contains a colon opens a
# new entry; every other line is appended to the entry currently being built.
entry_start = re.compile(r"^\S.*:.*", re.DOTALL)

description=[]
row=None
for line in lines:
    if entry_start.match(line):
        if row is not None:
            description.append(row)
        row=line
    elif row is not None:
        # continuation line; lines that precede the first entry are skipped
        row+=line

# Flush the final entry, whether the file ended on an entry line or on a continuation line
if row is not None:
    description.append(row)

# One row per feature column of df_train
df_info = pd.DataFrame()
df_info['dtype'] = df_train.dtypes
# isnull() is an alias of isna(), so the next two columns hold the same counts
df_info['nullValues'] = df_train.isnull().sum()
df_info['naValues'] = df_train.isna().sum()
# Positional assignment: the number of parsed entries must equal the number of columns.
# NOTE(review): assumes the description file lists the features in df_train's column order - confirm
df_info['columnDesc'] = description
df_info.to_csv('column_desc.csv')
df_info.head(20)

# **Changing  Datatypes**
<span style="font-size:18px;">
    Some features have incorrect datatypes and a lot of features have missing values. The feature set needs dtype conversion and imputation before it can be explored to derive insights
</span>

In [13]:
##Numerical columns that are really labels (years, months, class codes) are stored as object columns

# GarageYrBlt: missing years become 0, then float -> int64 -> object
garage_year = df_train['GarageYrBlt'].fillna(0).astype('int64')
df_train['GarageYrBlt'] = garage_year.astype('object')

df_train['MSSubClass'] = df_train['MSSubClass'].astype('object')

# Remaining year/month columns share the same conversion
for label_col in ('YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold'):
    df_train[label_col] = df_train[label_col].astype('object', copy=False)

df_train.info()

# **Primary Imputation**
<p style="font-size:18px">
    Simple logical assumptions should be the first approach to imputing the missing data.
</p>
<ol style="font-size:18px; line-height:200%">
    <li>MS Zoning: It can fairly be assumed that houses in the same residential zone will likely share the same housing subclass and neighborhood. Based on the MSSubClass and Neighborhood of a record with a missing MSZoning value, the missing value will be imputed with the mode MSZoning value for that combination of MSSubClass and Neighborhood, e.g. for a record with missing MSZoning whose MSSubClass and Neighborhood values are 150 and 'North Ames' respectively, the mode MSZoning value of all the records with the same pair of MSSubClass and Neighborhood values (Residential Low Density in this case) will be used for imputation.</li>
    <li>LotFrontage: Similar strategy can be used for LotFrontage as well. For missing LotFrontage, median LotFrontage of all records with same MSZoning value will be imputed</li>
    <li>Masonry Veneer Area and Type: No logical assumption can be leveraged for this feature. A simple imputation of missing Masonry Veneer Area values with 0 and missing Masonry Veneer Type values with None will be used here</li>
    <li>Basement features: There are a lot of basement features missing where the basement area is zero, hence imputing these missing features as 'NA' (categorical) or '0' (numerical) will be the right approach</li>
    <li>Garage features: There are a lot of garage features missing where the garage area is zero, hence imputing these missing features as 'NA' (categorical) or '0' (numerical) will be the right approach</li>
    <li>Utilities: Only a handful of values are missing, and nearly all of the values are 'AllPub' (2916 out of 2919). 'AllPub' will be used in place of the missing values</li>
    <li>Exterior features (1st and 2nd): Mode Exterior features corresponding to same MSSubClass and Neighborhood will be used for imputation</li>
    <li>Electrical, Functional, Kitchen features: Absolute mode values to impute the missing values seems the right way</li>  
    <li>Fireplaces, Pool and Miscellaneous features: Where the fireplace count/pool area/miscellaneous feature value is zero, the corresponding categorical features will be imputed as 'NA'. There are records where the pool area is not zero but the pool quality is missing; in that case the mode pool quality value will be used. There are records where the miscellaneous feature value is non-zero but the miscellaneous feature itself is missing; in that case the 'Othr' value will be used for imputation</li>
    <li>SaleType: Absolute mode value will be used to impute the missing values</li>  
    <li>Alley and Fence: Most of the records have missing values for these two feature. These columns will be discarded from further preprocessing</li>

</ol>


In [14]:
# Every imputation below assigns the result back to df_train explicitly.
# A chained in-place call such as `df_train[col].where(..., inplace=True)` mutates a
# temporary Series and does not reach df_train when pandas Copy-on-Write is active.

def _fill_with_mode(values):
    """Return `values` with missing entries replaced by its most frequent value.

    A Series without any non-missing value has no mode and is returned unchanged.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


##MSZoning: Mode MSZoning values sharing same MSSubclass and Neighborhood
df_train['MSZoning'] = df_train.groupby(['MSSubClass', 'Neighborhood'])['MSZoning'].transform(_fill_with_mode)

##Lot Features: For missing LotFrontage we use median LotFrontage values of same MSZoning properties
df_train['LotFrontage'] = df_train.groupby("MSZoning")['LotFrontage'].transform(lambda x: x.fillna(x.median()))

##Masonry Veneer Features: There is no logical way to impute the Masonry Veneer type and area. The best way will be to be conservative and assume there is no Masonry Veneer
df_train['MasVnrArea'] = df_train['MasVnrArea'].fillna(0)
df_train['MasVnrType'] = df_train['MasVnrType'].fillna("None")

##Basement Features: Wherever the total basement area is zero the basement features are set to "NA" (categorical) or 0 (numerical)
has_basement = df_train['TotalBsmtSF'] != 0
for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    df_train[col] = df_train[col].where(has_basement, "NA")
for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']:
    df_train[col] = df_train[col].where(has_basement, 0.0)

##BsmtExposure and BsmtFinType2 still have missing values where the basement area is non zero. In that case we impute the mode values
df_train['BsmtExposure'] = _fill_with_mode(df_train['BsmtExposure'])
df_train['BsmtFinType2'] = _fill_with_mode(df_train['BsmtFinType2'])

##Garage Features: Wherever the garage area is zero the categorical garage features are set to "NA"
has_garage = df_train['GarageArea'] != 0
for col in ['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']:
    df_train[col] = df_train[col].where(has_garage, "NA")

##Electrical
df_train['Electrical'] = _fill_with_mode(df_train['Electrical'])

##Fireplace Features
df_train['FireplaceQu'] = df_train['FireplaceQu'].where(df_train['Fireplaces'] != 0, "NA")

##Pool Features
df_train['PoolQC'] = df_train['PoolQC'].where(df_train['PoolArea'] != 0, "NA")

##Miscellaneous Features
df_train['MiscFeature'] = df_train['MiscFeature'].where(df_train['MiscVal'] != 0, "NA")

##Too many missing values in Alley and Fence feature and no logical step to impute. We will drop them.
# errors='ignore' keeps the cell re-runnable after the columns are already gone
df_train.drop(columns=['Alley', 'Fence'], errors='ignore', inplace=True)

##Missing values after primary imputation
df_train.info()

<span style="font-size:18px">
    Some categorical columns like Fireplace, Garage etc. are better represented as binary columns, since the presence or absence of those amenities is more important than a qualitative/quantitative value of the amenities present.
</span>

In [15]:
##Creating boolean columns ('Yes'/'No' flags for the presence of an amenity)
df_train['HasFireplace'] = np.where(df_train['Fireplaces']==0, 'No', 'Yes')
df_train['HasGarage'] = np.where(df_train['GarageCars']==0, 'No', 'Yes')
# The next three flags rely on the "NA" markers being present in these columns
df_train['HasBasement'] = np.where(df_train['BsmtQual']=='NA', 'No', 'Yes')
df_train['HasPool'] = np.where(df_train['PoolQC']=='NA', 'No', 'Yes')
df_train['HasAmenities']=np.where(df_train['MiscFeature']=='NA', 'No', 'Yes')

##Sale month as a timestamp. The strings look like "2-2008" (month-year); the explicit
##format removes any dependence on pandas' format inference for this ambiguous layout.
df_train['MoYrSold'] = pd.to_datetime(
    df_train['MoSold'].astype(str) + '-' + df_train['YrSold'].astype(str),
    format='%m-%Y',
)
df_train['YearBuilt'] = pd.to_datetime(df_train['YearBuilt'], format="%Y")
df_train['YearRemodAdd'] = pd.to_datetime(df_train['YearRemodAdd'], format="%Y")

##Re-attach the target for the exploratory plots
df_train['SalePrice']=SalePrice
df_train.head(10)

<p style="font-size:18px">
    Now the data is ready to be explored.
</p>
<p style="font-size:18px">
    The time period of sales spans from January 2006 to July 2010 (fig 1a). The month-to-month (MoM) sales chart shows a pattern in the sales count that repeats after certain time periods. The annual decomposition of the MoM chart confirms this; it shows strong annual seasonality (fig 1b). The monthly sales trend in this dataset roughly follows the national trend, with March to July having the highest sales and November, January and February the lowest.
</p>

In [16]:
# Figure 1: (a) sales count per month over the whole period, (b) sales count per calendar month, one line per year
fig = plt.figure(figsize=(15,12), layout='tight', dpi=100)
grid = fig.add_gridspec(3,3)
axis_label_style = dict(fontsize='xx-large', fontweight='bold')

## (a) top row of the grid: one point per month-year
ax1 = fig.add_subplot(grid[0,:])
data = pd.crosstab(df_train['MoYrSold'], columns=1)
sns.lineplot(x=data.index, y=data[1], ax=ax1);
xticklabels = ['Jan 06', 'Jul 06', 'Jan 07', 'Jul 07', 'Jan 08', 'Jul 08', 'Jan 09', 'Jul 09', 'Jan 10', 'Jul 10']
yticklabels = [20,40,60]
ax1.set_ylim(0,70)
# NOTE(review): the ten labels are attached to whatever tick positions matplotlib picked
# automatically; this presumably yields Jan/Jul ticks for 2006-2010 - confirm on re-run.
ax1.set_xticks(ticks=ax1.get_xticks(), labels=xticklabels, fontsize='x-large');
ax1.set_yticks(ticks=yticklabels, labels=yticklabels, fontsize='x-large');
ax1.set_xlabel('Month (MoM)', **axis_label_style);
ax1.set_ylabel('Sales Count', **axis_label_style);
ax1.text(-0.05,-0.15, r'(a)', transform=ax1.transAxes, fontsize='xx-large');

## (b) remaining two rows: month of sale on x, one line per year of sale
# rcParams change is global and also affects legends drawn afterwards
plt.rcParams['legend.title_fontsize'] = 'x-large'
ax2 = fig.add_subplot(grid[1:,:])
data = pd.crosstab(df_train['MoSold'], columns=df_train['YrSold'])
sns.lineplot(data=data, palette=sns.color_palette("Set1", n_colors=5), dashes=False, ax=ax2);
xticks = list(range(1,13))
xticklabels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
yticks = [20,40,60]
ax2.set_ylim(0,70)
ax2.set_xticks(ticks=xticks, labels=xticklabels, fontsize='xx-large');
ax2.set_yticks(ticks=yticks, labels=yticks, fontsize='xx-large');
ax2.set_xlabel('Month', **axis_label_style);
ax2.set_ylabel('Sales Count', **axis_label_style);
ax2.legend(title='Year Sold', fontsize='xx-large');
ax2.text(-0.05,-0.1, r'(b)', transform=ax2.transAxes, fontsize='xx-large');


#fig.savefig('B1.jpeg', dpi=100, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 1: a. Month-on-Month time series of sales count b. Seasonal decomposition of Month-on-Month sales count
    </i></b>    
</center>

<span style="font-size:18px">
    Most of the sales were witnessed by very few neighbourhoods, mainly North Ames, College Creek, Old Town and Edwards followed by Somerset, Gilbert and Northridge Heights (fig 2)
</span>

In [19]:
# Figure 2: yearly sales count, one facet per neighbourhood (facets follow `order`)
order=np.sort(df_train['Neighborhood'].unique())
# `fig` is a seaborn FacetGrid here, not a matplotlib Figure
fig = sns.catplot(data=df_train, x='YrSold', kind='count', col='Neighborhood', col_wrap=5, col_order=order,color=(0.8,0.3,0.3), height=3, aspect=0.9);

# Display name per neighbourhood code. Titles are looked up by code rather than by
# list position: np.sort orders the codes by code point (NAmes, NPkVill, NWAmes,
# NoRidge, NridgHt), which differs from the order of a hand-written title list.
neighborhood_names = {
    'Blmngtn': 'Bloomington \nHeights',
    'Blueste': 'Bluestem',
    'BrDale': 'Briardale',
    'BrkSide': 'Brookside',
    'ClearCr': 'Clear\nCreek',
    'CollgCr': 'College\nCreek',
    'Crawfor': 'Crawford',
    'Edwards': 'Edwards',
    'Gilbert': 'Gilbert',
    'IDOTRR': 'IOWA DOT &\nRail Road',
    'MeadowV': 'Meadow\nVillage',
    'Mitchel': 'Mitchell',
    'NAmes': 'North\nAmes',
    'NoRidge': 'Northridge',
    'NPkVill': 'Northpark\nVilla',
    'NridgHt': 'Northridge\nHeights',
    'NWAmes': 'Northwest\nAmes',
    'OldTown': 'Old Town',
    'SWISU': 'South&West of\nIowa State\nUniversity',
    'Sawyer': 'Sawyer',
    'SawyerW': 'Sawyer\nWest',
    'Somerst': 'Somerset',
    'StoneBr': 'Stone\nBrook',
    'Timber': 'Timberland',
    'Veenker': 'Veenker',
}
# One title per facet, aligned with `order`; an unknown code falls back to the raw code
titles=[neighborhood_names.get(code, code) for code in order]

xticklabels=['\'06', '\'07', '\'08', '\'09', '\'10']
for facet_ax, title in zip(fig.axes.flat, titles):
    facet_ax.tick_params(axis='both', which='both', bottom=False, left=False, labelleft=False)
    # assumes every facet has exactly five x ticks (one per sale year) - TODO confirm
    facet_ax.set_xticks(facet_ax.get_xticks(), labels=xticklabels, fontsize='xx-large')
    # Print the count on top of every bar
    for bars in facet_ax.containers:
        facet_ax.bar_label(bars, fontsize='xx-large')
    facet_ax.set_ylim(0,100)
    facet_ax.set_xlabel("Year Sold", fontsize='xx-large', fontstretch='semi-expanded', fontweight='bold');
    facet_ax.set_ylabel("Count", fontsize='xx-large', fontweight='bold');
    facet_ax.set_title(title,fontsize='xx-large', fontweight='bold', x=0.5, y=0.7);

#fig.savefig('B2.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 2 : Histogram of annual sales decomposed by neighbourhood
    </i></b>    
</center>

<p style="font-size:18px">
    Ames, with population of 66,427 according to 2020 census, is more of a college town than a metropolitan area with very few industries/ institutions other than Iowa State University, Ames National Lab, USDA research centres [2][3]. These facts were also reflected in types of houses featured in the dataset. Out of 1460 houses, 1220 were single family detached and 157 were townhouse (fig 3a). Very few houses had more than 2 levels (fig 3b). More than half of the houses were one story and 3 out of every 4 houses were either one story or two story. More than 75 percent of the houses were in low-density residential zones and only around 15 percent of them were in medium or high-density residential zones (fig 3c). Only 10 properties were in commercial zone.
</p>

In [8]:
# Figure 3: count plots of (a) building type, (b) house style, (c) zoning class
fig, ax = plt.subplots(figsize=(22,9),ncols=3, nrows=1)

# One entry per panel: (column, x-axis label, upper y limit, panel tag, legend labels).
# NOTE(review): legend labels are matched to the bars by position, so they are assumed
# to be listed in the order in which countplot draws the categories - confirm against the data.
count_panels = [
    ('BldgType', 'Building Type', 1400, r'(a)',
     ['Single family detached',
      'Two family converted',
      'Duplex',
      'Townhouse end unit',
      'Townhouse inside unit']),
    ('HouseStyle', 'House Style', 900, r'(b)',
     ['Two story',
      'One story',
      'One and half story \n2nd level finished',
      'One and half story \n2nd level unfinished',
      'Split Foyer',
      'Split Level',
      'Two and half story \n2nd level unfinished',
      'Two and half story \n2nd level finished']),
    ('MSZoning', 'Housing Zone', 1400, r'(c)',
     ['Residential \nLow Density',
      'Residential \nMedium Density',
      'Commercial',
      'Floating Village \nResidential',
      'Residential \nHigh Density']),
]

for axis, (column, xlabel, ymax, tag, labels) in zip(ax, count_panels):
    # One colour per distinct value of the column being plotted (each panel sizes the
    # palette from its own column, including the zoning panel)
    palette = sns.color_palette('Set2', n_colors=len(df_train[column].unique()))
    # Proxy artists so the legend shows one coloured patch per category
    handles=[Patch(color=colour) for colour in palette]
    axis.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
    sns.countplot(data=df_train, x=column, palette=palette, ax=axis);
    # Print the count on top of every bar
    for i in axis.containers:
        axis.bar_label(i, fontsize='xx-large', fontweight='bold');

    axis.set_xlabel(xlabel,  fontsize='xx-large', fontweight='bold')
    axis.set_ylabel('Count',  fontsize='xx-large', fontweight='bold')
    axis.set_ylim(0,ymax)
    axis.legend(handles=handles, labels=labels, fontsize='xx-large');
    axis.text(0,-0.05, tag,transform=axis.transAxes, fontsize='xx-large');

#fig.savefig('B3.jpeg', dpi=100, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 3 : Count plot of sales across a. building types b. House Style c. Housing Zone
    </i></b>    
</center>

<p style="font-size:18px">
    For most of the neighbourhoods the annual sales count was below 20 throughout the five-year period. Almost 90 percent of the properties had a near-flat land contour (fig 4a). The topographical map of Ames shows some hilly regions with a slight depression in the central and southern parts of the city [4]. This accounts for the roughly 10 percent of houses having either a banked, hillside or depressed contour. More than 96 percent of the property lots either had a regular shape or were slightly irregular, and less than a percent of lots were completely irregular (fig 4b). More than 70 percent of property lots had an inside locality, around 18 percent had a corner locality and less than 4 percent had frontage exposure on 2 or 3 sides (fig 4c). Almost all the properties had access to a paved street (fig 4d).
</p>

In [20]:
# Figure 4: pie charts of (a) land contour, (b) lot shape, (c) lot configuration, (d) street surface
fig, ax = plt.subplots(figsize=(16,16), ncols=2, nrows=2)

# One entry per pie. `names` maps each category code to its legend text, so the legend
# always follows the wedge order (value_counts order) instead of a hand-ordered list.
# `explode` must contain one offset per category of the column.
pie_panels = [
    dict(axis=ax[0,0], column="LandContour", title='Land Contour', tag=r'(a)',
         names={'Lvl': 'Near Flat', 'Bnk': 'Banked', 'HLS': 'Hillside', 'Low': 'Depression'},
         explode=(0, 0, 0, 0.3), startangle=0, anchor=(1.28,1)),
    dict(axis=ax[0,1], column="LotShape", title='Lot Shape', tag=r'(b)',
         names={'Reg': 'Regular', 'IR1': 'Slight\nirregular', 'IR2': 'Moderately\nirregular', 'IR3': 'Irregular'},
         explode=(0, 0, 0, 0.3), startangle=210, anchor=(1.35,1)),
    dict(axis=ax[1,0], column="LotConfig", title='Lot Configuration', tag=r'(c)',
         names={'Inside': 'Inside', 'Corner': 'Corner', 'CulDSac': 'Cul-de-sac', 'FR2': '2-side\nfrontage', 'FR3': '3-side\nfrontage'},
         explode=(0, 0, 0, 0.3, 0.4), startangle=0, anchor=(1.3,1.1)),
    dict(axis=ax[1,1], column="Street", title='Paved Street?', tag=r'(d)',
         names={'Pave': 'Paved', 'Grvl': 'Gravel'},
         explode=(0, 0.05), startangle=30, anchor=(1.23,1.1)),
]

for panel in pie_panels:
    axis = panel['axis']
    counts = df_train[panel['column']].value_counts()
    data = counts.values
    # Legend text in wedge order; a code without a mapping is shown as-is
    labels = [panel['names'].get(code, code) for code in counts.index]
    explode = panel['explode']
    axis.pie(data, explode=explode, autopct='%1.2f%%', pctdistance=1.2, startangle=panel['startangle'],textprops={'fontsize':'xx-large'})
    axis.set_title(panel['title'], fontsize='xx-large', y=0.9);
    axis.legend(labels=labels, loc=1, bbox_to_anchor=panel['anchor'], fontsize='xx-large');
    axis.text(0.1,0.05,panel['tag'],transform=axis.transAxes, fontsize='xx-large');

#fig.savefig('B4.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 4: Pie chart of property features a. Land Contour b. Lot Shape c. Lot Configuration and d. Street Pavement
    </i></b>    
</center>

<p style="font-size:18px">
    It snows from Nov to Mar in Ames [3] and also, the tornado index is 337.09 [5]. This justifies the disproportionately high number of properties with Gable and Hip roofing style and shingled roofs (fig 5a, 5b) [6]. Furthermore, each and every house had heating system since this is an essential amenity in cold regions. Almost 98 percent of the houses were equipped with Gas forced warm air furnace.
</p>

In [21]:
##Note: With several seaborn subplots, pass the target axes explicitly (sns.countplot(ax=ax))
#rather than writing ax = sns.countplot(); the latter draws every subplot and its annotations on the last axes
fig, ax=plt.subplots(3,1,figsize=(12,13.5))

# One entry per horizontal count plot: (column, y-axis label, legend anchor, panel tag, legend labels)
# NOTE(review): legend labels are matched to the bars by position - confirm the order against the data.
roof_heating_panels = [
    ('RoofStyle', "Roofing Style", (1,0.7), r'(a)',
     ['Gable','Hip','Gambrel','Mansard','Flat','Shed']),
    ('RoofMatl', "Roofing Material", (1,0.8), r'(b)',
     ['Standard (Composite) Shingle', 'Wood Shingles', 'Metal', 'Wood Shakes', 'Membrane', 'Tar & Gravel', 'Roll', 'Clay Tile']),
    ('Heating', "Heating Systems", (1,0.7), r'(c)',
     ['Gas forced warm air furnace', 'Gas hot water/ steam heat', 'Gravity furnace', 'Wall furnace', 'Hot water/ steam heat other than gas', 'Floor Furnace']),
]

for axis, (column, ylabel, anchor, tag, labels) in zip(ax, roof_heating_panels):
    # One colour (and one legend patch) per distinct value of the column
    palette=sns.color_palette('Set2', n_colors=len(df_train[column].unique()))
    handles=[Patch(color=colour) for colour in palette]
    axis.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False)
    sns.countplot(data=df_train, y=column, palette=palette, ax=axis);
    axis.set_xlim(0,2000)
    # Re-apply the existing tick labels only to change their font size
    axis.set_yticks(axis.get_yticks(), axis.get_yticklabels(), fontsize='large')
    # Print the count next to every bar
    for i in axis.containers:
        axis.bar_label(i, fontsize='large', fontweight='bold');

    axis.set_ylabel(ylabel, fontsize='large', fontstretch='semi-expanded', fontweight='bold');
    axis.set_xlabel("Count", fontsize='large', fontweight='bold');
    axis.legend(handles=handles, labels=labels, fontsize='large', bbox_to_anchor=anchor)
    axis.text(0,-0.05, tag,transform=axis.transAxes, fontsize='large');

#fig.savefig('B5.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 5: Count plot representing distribution of a. Roofing Style b. Roofing Material c. Heating System
    </i></b>    
</center>

<p style="font-size:18px">
    More than half of the houses had fireplaces (fig 6a). Almost 95 percent of them had a garage and more than 97 percent had a basement (fig 6b, 6c). The overall weather and temperature profile of Ames is more characteristic of a tundra than of a tropical region. This translates into less than half a percent of homes having a pool (fig 6d). All the properties except one were equipped with all public utilities (fig 6e). Less than 4 percent of the properties had additional amenities like a shed, a 2nd garage or a tennis court (fig 6f).
</p>

In [22]:
# Figure 6: pie charts for the presence/absence flags and the utilities column
fig, ax = plt.subplots(figsize=(8,12), ncols=2, nrows=3)

# One entry per pie, in row-major panel order:
# (column, title, panel tag, explode offsets, start angle, legend labels or None).
# Panels without legend labels annotate the wedges with the category values instead.
presence_panels = [
    ("HasFireplace", 'Has Fireplace?', r'(a)', (0, 0.05), 90, None),
    ("HasGarage", 'Has Garage?', r'(b)', (0, 0.1), 0, None),
    ("HasBasement", 'Has Basement?', r'(c)', (0, 0.1), 0, None),
    ("HasPool", 'Has Pool?', r'(d)', (0, 0.1), 0, None),
    ("Utilities", 'Has Utilities?', r'(e)', (0, 0.2), 0, ['All Utilities', 'Elec and Gas']),
    ("HasAmenities", 'Has Amenities?', r'(f)', (0, 0.1), 0, None),
]

for axis, (column, title, tag, explode, startangle, legend_labels) in zip(ax.flat, presence_panels):
    counts = df_train[column].value_counts()
    data = counts.values
    labels = counts.index
    pie_kwargs = dict(explode=explode, autopct='%1.2f%%', pctdistance=0.6,
                      startangle=startangle, textprops={'fontsize':'x-large'})
    if legend_labels is None:
        # wedge labels are the category values themselves ('Yes' / 'No')
        pie_kwargs.update(labels=labels, labeldistance=1)
    axis.pie(data, **pie_kwargs)
    axis.set_title(title, fontsize='xx-large', y=0.9);
    if legend_labels is not None:
        axis.legend(labels=legend_labels, loc=4, bbox_to_anchor=(1.2,0))
    axis.text(0,0,tag,transform=axis.transAxes);

#fig.savefig('B6.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italics">
    <b><i>
    Figure 6: Pie chart representing present/absence of a. Fireplace b. Garage c. Basement d. Pool e. Utilities f. Amenities
    </i></b>    
</center>

<p style="font-size:18px">
    The average selling price in the dataset was 180921.2 US Dollars with a standard deviation of 79442.5 US Dollars (fig 7a). The distribution of the sales price feature was close to a normal distribution but skewed to the right. The skewness was reflected in the Quantile – Quantile plot, where the graph of empirical values vs theoretical quantiles was concave upward; it was flatter than the ideal line before the median value and became steeper after it (fig 7b). The skewness of this distribution was 1.88. The most common technique to remove skewness is to take the log transform of the feature values and observe the distribution of the log values. To avoid the occurrence of inf or highly negative log values, the log1p transform was used ($p \rightarrow \log(1+p)$). The distribution of the log values was much closer to the normal distribution than that of the regular-scale values (fig 7c). The QQ plot also reflects the near-normal distribution, with the right skewness reduced to 0.12 (fig 7d). More than a percent of houses had a selling price 3 standard deviations above the mean, which also contributed to the right skewness.
</p>

In [23]:
# Figure 7: distribution and Q-Q plot of SalePrice on the raw scale (top row)
# and after the log1p transform (bottom row)

def _plot_price_distribution(axis, values, xticks, xticklabels, yticks, xlabel, summary_x, tag):
    """Draw a histogram with KDE of `values` on `axis`.

    The mean and standard deviation of `values` are printed at
    (`summary_x`, 0.8) in axes coordinates and the panel tag `tag`
    below the lower-left corner.
    """
    text= f'Summary\n\u03BC : {np.round(values.mean(),2)} \n\u03C3 : {np.round(values.std(),2)}'
    axis.tick_params(bottom=False, left=False)
    sns.histplot(values, ax=axis, kde=True)
    axis.set_xticks(ticks=xticks, labels=xticklabels, fontsize='x-large')
    axis.set_yticks(ticks=yticks, labels=yticks, fontsize='x-large')
    axis.set_xlabel(xlabel, fontsize='x-large', fontweight='bold')
    axis.set_ylabel('Sales Count', fontsize='x-large', fontweight='bold')
    axis.text(summary_x,0.8, text, transform=axis.transAxes, fontsize='x-large', fontweight='bold')
    axis.text(-0.05,-0.05, tag, transform=axis.transAxes, fontsize='x-large')


def _plot_price_qq(axis, values, yticks, yticklabels, tag):
    """Draw the probability plot of `values` on `axis`.

    The skewness of `values` is printed in the upper-left corner and the
    panel tag `tag` below the lower-left corner.
    """
    xticks=[-3,-2,-1,0,1,2,3]
    text=f'Skewness : {np.round(stats.skew(values),2)}'
    axis.tick_params(bottom=False, left=False)
    stats.probplot(values, plot=axis)
    axis.set_xticks(ticks=xticks, labels=xticks, fontsize='x-large')
    axis.set_yticks(ticks=yticks, labels=yticklabels, fontsize='x-large')
    axis.set_xlabel('Theoretical quantiles', fontsize='x-large', fontweight='bold')
    axis.set_ylabel('Ordered Values', fontsize='x-large', fontweight='bold')
    axis.text(0.1,0.9, text, transform=axis.transAxes, fontsize='x-large', fontweight='bold')
    # probplot sets its own title; blank it
    axis.set_title('')
    axis.text(-0.05,-0.05, tag, transform=axis.transAxes, fontsize='x-large')


gs_kw=dict(width_ratios=[1.5,1])
fig, ax = plt.subplot_mosaic([['ul','ur'],['ll','lr']],
                             figsize=(15,12), gridspec_kw=gs_kw)

# log1p (log(1 + p)) as described in the accompanying text; computed once for both lower panels
log_price = np.log1p(SalePrice)
price_ticks = [200000,400000,600000]
price_ticklabels = ['200k', '400k', '600k']
log_ticks = [11.0,12.0,13.0]

##SalesPrice Distribution
_plot_price_distribution(ax['ul'], SalePrice, price_ticks, price_ticklabels, [50,100,150], 'Sales Price', 0.7, r'(a)')

##SalePrice QQ plot
_plot_price_qq(ax['ur'], SalePrice, price_ticks, price_ticklabels, r'(b)')

##Log SalesPrice Distribution (panel c; each panel carries its own tag)
_plot_price_distribution(ax['ll'], log_price, log_ticks, log_ticks, [40,80,120], 'Sales Price (in log)', 0.75, r'(c)')

##Log SalePrice QQ plot
_plot_price_qq(ax['lr'], log_price, log_ticks, log_ticks, r'(d)');

#fig.savefig('B7.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 7: a. Distribution of Sales Price b. Q - Q plot of distribution of Sales Price c. Distribution of Sales Price on log scale d. Q - Q plot of distribution of Sales Price on log scale
    </i></b>    
</center>

<p style="font-size:18px">
    If a numerical variable spans a semi-infinite range, from 0 to ∞, it is more likely to be right-skewed than left-skewed. The analysis of other important numerical features such as ground living area, basement area and lot size added evidence to this hypothesis. The ground living area and basement area had right skewness of 1.37 and 1.52 respectively (fig 8a, 8b, 8c, 8d). Since some of the houses did not have a basement, the frequency count at 0 was high. A few houses had a lot area greater than 40000 sq ft. These outliers pushed the right skewness of the lot size feature up to 12.2 (fig 8e, 8f).
</p>

In [24]:
# Figure 8: for each area feature, a histogram with KDE (left column) next to
# its normal Q-Q plot (right column), all on the raw square-foot scale.
gs_kw=dict(width_ratios=[1.2,1])
fig, ax = plt.subplot_mosaic([['ul','ur'],['ml','mr'],['ll','lr']],
                             figsize=(15,18), gridspec_kw=gs_kw)

bold_xl = dict(fontsize='x-large', fontweight='bold')  # shared text styling
qq_xticks = [-3,-2,-1,0,1,2,3]                         # theoretical-quantile ticks
lot_ticks = [0,10000,30000,50000,100000,150000,200000]
lot_ticklabels = ['0','10k','30k','50k','100k','150k','200k']

# One entry per figure row. `value_ticks` are used on the histogram x-axis and
# on the Q-Q plot y-axis; `count_ticks` are the histogram y-axis ticks.
panels = [
    dict(column='GrLivArea', xlabel='Ground Living Area (in Sq Ft)',
         hist='ul', qq='ur', tags=('(a)', '(b)'),
         value_ticks=[1000,2000,3000,4000,5000],
         count_ticks=[40,80,120]),
    dict(column='TotalBsmtSF', xlabel='Basement Area (in Sq Ft)',
         hist='ml', qq='mr', tags=('(c)', '(d)'),
         value_ticks=[0,1000,2000,3000,4000,5000,6000],
         count_ticks=[40,80,120]),
    dict(column='LotArea', xlabel='Lot Size (in Sq Ft)',
         hist='ll', qq='lr', tags=('(e)', '(f)'),
         value_ticks=lot_ticks, value_labels=lot_ticklabels,
         count_ticks=[40,80,120,160]),
]

for spec in panels:
    data = df_train[spec['column']]
    value_ticks = spec['value_ticks']
    value_labels = spec.get('value_labels', value_ticks)  # default: label ticks with their values
    count_ticks = spec['count_ticks']
    hist_ax, qq_ax = ax[spec['hist']], ax[spec['qq']]
    hist_tag, qq_tag = spec['tags']

    ## Distribution (histogram + KDE) with mean / standard deviation summary
    text = f'Summary\n\u03BC : {np.round(data.mean(),2)} \n\u03C3 : {np.round(data.std(),2)}'
    hist_ax.tick_params(bottom=False, left=False)
    sns.histplot(data=data, ax=hist_ax, kde=True)
    hist_ax.set_xticks(ticks=value_ticks, labels=value_labels, fontsize='x-large')
    # Set the count ticks exactly once; the original repeated this call for the
    # basement panel without the font size, which was redundant at best.
    hist_ax.set_yticks(ticks=count_ticks, labels=count_ticks, fontsize='x-large')
    hist_ax.set_xlabel(spec['xlabel'], **bold_xl)
    hist_ax.set_ylabel('Sales Count', **bold_xl)
    hist_ax.text(0.7, 0.8, text, transform=hist_ax.transAxes, **bold_xl)
    hist_ax.text(-0.05, -0.05, hist_tag, transform=hist_ax.transAxes, fontsize='x-large')

    ## Normal Q-Q plot with sample skewness
    text = f'Skewness : {np.round(stats.skew(data),2)}'
    qq_ax.tick_params(bottom=False, left=False)
    stats.probplot(data, plot=qq_ax)
    qq_ax.set_xticks(ticks=qq_xticks, labels=qq_xticks, fontsize='x-large')
    qq_ax.set_yticks(ticks=value_ticks, labels=value_labels, fontsize='x-large')
    qq_ax.set_xlabel('Theoretical quantiles', **bold_xl)
    qq_ax.set_ylabel('Ordered Values', **bold_xl)
    qq_ax.text(0.1, 0.9, text, transform=qq_ax.transAxes, **bold_xl)
    qq_ax.set_title('')  # drop the default title added by probplot
    qq_ax.text(-0.05, -0.05, qq_tag, transform=qq_ax.transAxes, fontsize='x-large')

#fig.savefig('B8.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 8 : a. Distribution of Ground Living Area b. Q - Q plot of distribution of Ground Living Area c. Distribution of Basement Area d. Q - Q plot of distribution of Basement Area e. Distribution of Lot Size f. Q - Q plot of distribution of Lot Size
    </i></b>    
</center>

<p style="font-size:18px">
    The log1p transform made the ground living area distribution almost normal, with a small left skewness of 0.01 (fig 9a, 9b). However, for the basement area the same transform brought about a huge shift from slight right skewness on the normal scale to large left skewness on the log scale (fig 9c, 9d). Careful inspection of the Q – Q plot of the basement area (normal scale) shows that the change of basement area from a no-basement property to a property with a basement is not smooth and gradual but rather abrupt and steep, since no-basement properties had zero basement area and most of the properties with a basement had a basement area similar to the ground living area (which is quite natural, given that the ground floor is built on top of the basement). Hence, in figure 8c, we see a tall bar at zero followed by really short bars, eventually followed by taller ones. Under the log1p transform all the datapoints with zero basement area are clustered at zero, while the other datapoints form a near-Gaussian distribution centered around $e^{6.75}$. This caused a left skewness of 5.15 in the distribution. The lot size distribution had a slight left skewness of 0.14 after the log1p transform (fig 9e, 9f). This is a significant improvement.
</p>

In [25]:
# Figure 9: same layout as the raw-scale figure, but every feature is first
# passed through log1p (log(1 + x), which is defined for zero areas).
gs_kw=dict(width_ratios=[1.2,1])
fig, ax = plt.subplot_mosaic([['ul','ur'],['ml','mr'],['ll','lr']],
                             figsize=(15,18), gridspec_kw=gs_kw)

bold_xl = dict(fontsize='x-large', fontweight='bold')  # shared text styling
qq_xticks = [-3,-2,-1,0,1,2,3]                         # theoretical-quantile ticks

# One entry per figure row. `value_ticks` are used on the histogram x-axis and
# on the Q-Q plot y-axis; `summary_x` is the axes-fraction x position of the
# mean/std box (moved left for the basement panel in the original layout).
panels = [
    dict(column='GrLivArea', xlabel='Ground Living Area (in log Sq Ft)',
         hist='ul', qq='ur', tags=('(a)', '(b)'),
         value_ticks=[6.0,6.5,7.0,7.5,8.0,8.5],
         count_ticks=[40,80,120,160], summary_x=0.75),
    dict(column='TotalBsmtSF', xlabel='Basement Area (in log Sq Ft)',
         hist='ml', qq='mr', tags=('(c)', '(d)'),
         value_ticks=[0,2,4,6,8],
         count_ticks=[40,80,120,160], summary_x=0.45),
    dict(column='LotArea', xlabel='Lot Size (in log Sq Ft)',
         hist='ll', qq='lr', tags=('(e)', '(f)'),
         value_ticks=[7,8,9,10,11,12],
         count_ticks=[40,80,120], summary_x=0.75),
]

for spec in panels:
    data = np.log1p(df_train[spec['column']])
    value_ticks = spec['value_ticks']
    count_ticks = spec['count_ticks']
    hist_ax, qq_ax = ax[spec['hist']], ax[spec['qq']]
    hist_tag, qq_tag = spec['tags']

    ## Distribution (histogram + KDE) with mean / standard deviation summary
    text = f'Summary\n\u03BC : {np.round(data.mean(),2)} \n\u03C3 : {np.round(data.std(),2)}'
    hist_ax.tick_params(bottom=False, left=False)
    sns.histplot(data=data, ax=hist_ax, kde=True)
    hist_ax.set_xticks(ticks=value_ticks, labels=value_ticks, fontsize='x-large')
    # Set the count ticks exactly once; the original repeated this call for the
    # lot-size panel without the font size, which was redundant at best.
    hist_ax.set_yticks(ticks=count_ticks, labels=count_ticks, fontsize='x-large')
    hist_ax.set_xlabel(spec['xlabel'], **bold_xl)
    hist_ax.set_ylabel('Sales Count', **bold_xl)
    hist_ax.text(spec['summary_x'], 0.8, text, transform=hist_ax.transAxes, **bold_xl)
    hist_ax.text(-0.05, -0.05, hist_tag, transform=hist_ax.transAxes, fontsize='x-large')

    ## Normal Q-Q plot with sample skewness
    text = f'Skewness : {np.round(stats.skew(data),2)}'
    qq_ax.tick_params(bottom=False, left=False)
    stats.probplot(data, plot=qq_ax)
    qq_ax.set_xticks(ticks=qq_xticks, labels=qq_xticks, fontsize='x-large')
    qq_ax.set_yticks(ticks=value_ticks, labels=value_ticks, fontsize='x-large')
    qq_ax.set_xlabel('Theoretical quantiles', **bold_xl)
    qq_ax.set_ylabel('Ordered Values', **bold_xl)
    qq_ax.text(0.1, 0.9, text, transform=qq_ax.transAxes, **bold_xl)
    qq_ax.set_title('')  # drop the default title added by probplot
    qq_ax.text(-0.05, -0.05, qq_tag, transform=qq_ax.transAxes, fontsize='x-large')

plt.show()
#fig.savefig('B9.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 9 : a. Distribution of Ground Living Area on log scale b. Q - Q plot of distribution of Ground Living Area on log scale c. Distribution of Basement Area on log scale d. Q - Q plot of distribution of Basement Area on log scale e. Distribution of Lot Size on log scale f. Q - Q plot of distribution of Lot Size on log scale
    </i></b>    
</center>

<p style="font-size:18px">
    Furthermore, some of the features believed to have an impact on sales were studied. The regression plot of Sales Price vs Ground Living Area reveals partially monotonic behavior, where houses with a larger 
ground living area are more likely to be expensive than houses with a smaller ground living area (fig 10a). The solid blue straight line represents the linear regression model that fits the data, with the light blue shaded region being the 95 percent confidence interval (CI). Since the spread of the data increases as we move in the positive direction along both variable axes, the confidence interval also gets wider. When the same two features were plotted on the log1p scale, the extent of linearity increased considerably. A few datapoints had an abnormally low selling price compared to datapoints with similar ground living area (fig 10b). Except for those points, almost all the datapoints were clustered around the linear regression model. The CI of the regression model was also consistently narrow.
Another such feature, lot size, was plotted against the sales price. Due to a few irregularly shaped lots of exceptionally large size, the linear regression model failed to accommodate the underlying data (fig 10c). Since these large lots had really low sales prices, the CI of the regression model widened sharply in the positive direction of both axes. The log1p transform of the two variables did help in bringing out a more monotonic and linear relation between them, but the spread of datapoints was still not consistent (fig 10d). Removing the datapoints with abnormal lot sizes and repeating the line fitting might reveal different results.
</p>

In [None]:
# Figure 10: Sales Price regressed on Ground Living Area (top row) and on
# Lot Size (bottom row), on the raw scale (left) and the log1p scale (right).
fig, ax=plt.subplots(figsize=(16,16), ncols=2, nrows=2)

# The loading cell drops the SalePrice column from df_train and keeps the target
# in the separate `SalePrice` series, so re-attach it (aligned on the Id index)
# instead of reading df_train['SalePrice'].  If the column has been restored by
# an intermediate cell this simply overwrites it with the same series.
raw_frame = df_train[['GrLivArea', 'LotArea']].assign(SalePrice=SalePrice)
log_frame = np.log1p(raw_frame)

bold_xl = dict(fontsize='x-large', fontweight='bold')  # shared axis-label styling
log_price_ticks = [11,12,13]

# One entry per panel; `xlabels` / `ylabels` default to the tick values.
panels = [
    ##Sales Price vs Ground Living Area relation
    dict(pos=(0,0), frame=raw_frame, x='GrLivArea',
         xticks=[1000,2000,3000,4000,5000],
         yticks=[100000,300000,500000,700000],
         ylabels=['100k','300k','500k','700k'],
         xlabel='Ground Living Area (in Sq Ft)', ylabel='Sales Price'),
    ##Sales Price vs Ground Living Area relation in log scale
    dict(pos=(0,1), frame=log_frame, x='GrLivArea',
         xticks=[6.0,6.5,7.0,7.5,8.0,8.5],
         yticks=log_price_ticks,
         xlabel='Ground Living Area (in log Sq Ft)', ylabel='Sales Price (in logscale)'),
    ##Sales Price vs Lot Size relation
    dict(pos=(1,0), frame=raw_frame, x='LotArea',
         xticks=[5000,50000,100000,150000,200000],
         xlabels=['5k','50k','100k','150k','200k'],
         yticks=[200000,400000,600000,800000, 1000000],
         ylabels=['200k','400k','600k','800k', '1M'],
         xlabel='Lot Size (in Sq Ft)', ylabel='Sales Price'),
    ##Sales Price vs Lot Size relation in log scale
    dict(pos=(1,1), frame=log_frame, x='LotArea',
         xticks=[7,8,9,10,11,12],
         yticks=log_price_ticks,
         xlabel='Lot Size (in log Sq Ft)', ylabel='Sales Price (in logscale)'),
]

for spec in panels:
    panel_ax = ax[spec['pos']]
    panel_ax.tick_params(bottom=False, left=False)
    sns.regplot(data=spec['frame'], x=spec['x'], y='SalePrice', ax=panel_ax)
    panel_ax.set_xticks(ticks=spec['xticks'], labels=spec.get('xlabels', spec['xticks']), fontsize='x-large')
    panel_ax.set_yticks(ticks=spec['yticks'], labels=spec.get('ylabels', spec['yticks']), fontsize='x-large')
    panel_ax.set_xlabel(spec['xlabel'], **bold_xl)
    panel_ax.set_ylabel(spec['ylabel'], **bold_xl)

plt.show()

#fig.savefig('B10.jpeg', dpi=1200, bbox_inches='tight')


<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 10: a. Regression plot between Sales Price and Ground Living Area b. Regression plot between log of Sale Price and log Ground Living Area c. Regression plot between Sales Price and Lot Size d. Regression plot between log of Sale Price and log Lot Size
    </i></b>    
</center>

<p style="font-size:18px">
    The regression plot between sales price and ground living area was further decomposed by neighborhood (fig 11). In almost all the neighborhoods, datapoints were clustered around a small range of sales price and ground living area and showed linear behavior. However, the extent of linearity and the gradient of the regression line varied across the neighborhoods. In Brookside, College Creek, Meadow Village, and South & West of Iowa State University, the Sales Price was more sensitive to the ground living area than in other neighborhoods. For all the neighborhoods, the gradient of the regression line was almost the same when fitted on the log1p scale (fig 12). The CI of the regression line improved for the majority of the neighborhoods, except in places where sales prices and ground living area were very low.
</p>    

In [None]:
# Figure 11: Sales Price vs Ground Living Area, one regression facet per
# neighbourhood.
# The loading cell drops SalePrice from df_train and keeps it in the separate
# `SalePrice` series, so re-attach it (aligned on the Id index) for plotting.
data=df_train[['GrLivArea', 'Neighborhood']].assign(SalePrice=SalePrice)
fig=sns.lmplot(data=data, x='GrLivArea', y='SalePrice', col='Neighborhood', col_wrap=5, height=3, aspect=0.9)

# Display title per Neighborhood code.  Titles are looked up by the facet's
# own code rather than by position: seaborn orders facets by category order or
# first appearance in the data, which need not match a hand-written list.
titles={'Blmngtn': 'Bloomington \nHeights',
        'Blueste': 'Bluestem',
        'BrDale': 'Briardale',
        'BrkSide': 'Brookside',
        'ClearCr': 'Clear \nCreek',
        'CollgCr': 'College \nCreek',
        'Crawfor': 'Crawford',
        'Edwards': 'Edwards',
        'Gilbert': 'Gilbert',
        'IDOTRR': 'IOWA DOT & \nRail Road',
        'MeadowV': 'Meadow \nVillage',
        'Mitchel': 'Mitchell',
        'NAmes': 'North \nAmes',
        'NoRidge': 'Northridge',
        'NPkVill': 'Northpark \nVilla',
        'NridgHt': 'Northridge \nHeights',
        'NWAmes': 'Northwest \nAmes',
        'OldTown': 'Old Town',
        'SWISU': 'South&West\nof ISU',
        'Sawyer': 'Sawyer',
        'SawyerW': 'Sawyer \nWest',
        'Somerst': 'Somerset',
        'StoneBr': 'Stone \nBrook',
        'Timber': 'Timberland',
        'Veenker': 'Veenker'}
xticks=[1000, 3000, 5000]
xticklabels=['1k','3k','5k']
yticks=[200000,400000,600000]
yticklabels=['200k','400k','600k']
# axes_dict maps each facet's Neighborhood value to its Axes.
for code, facet_ax in fig.axes_dict.items():
    facet_ax.tick_params(axis='both', which='both', bottom=False, left=False)
    facet_ax.set_xlabel("Ground Living \nArea (in Sq Ft)", fontsize='x-large', fontstretch='semi-expanded', fontweight='bold')
    facet_ax.set_xticks(ticks=xticks, labels=xticklabels, fontsize='xx-large')
    facet_ax.set_ylim(0,800000)
    facet_ax.set_yticks(ticks=yticks, labels=yticklabels, fontsize='xx-large')
    facet_ax.set_ylabel("Sales Price", fontsize='large', fontweight='bold')
    # Fall back to the raw code if a neighbourhood is missing from the mapping.
    facet_ax.set_title(titles.get(code, str(code)), fontsize='large', fontweight='bold', x=0.4, y=0.8)

plt.show()
#fig.savefig('B11.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 11 : Regression plot between Sales Price and Ground Living Area decomposed by Neighbourhoods
    </i></b>    
</center>

In [None]:
# Figure 12: log1p(Sales Price) vs log1p(Ground Living Area), one regression
# facet per neighbourhood.
# The loading cell drops SalePrice from df_train and keeps it in the separate
# `SalePrice` series, so the target is taken from that series (the frame
# constructor aligns all three columns on the Id index).
data=pd.DataFrame({'GrLivArea': np.log1p(df_train['GrLivArea']),
                   'SalePrice': np.log1p(SalePrice),
                   'Neighborhood': df_train['Neighborhood']})
fig=sns.lmplot(data=data, x='GrLivArea', y='SalePrice', col='Neighborhood', col_wrap=5, height=3, aspect=0.9)

# Display title per Neighborhood code.  Titles are looked up by the facet's
# own code rather than by position: seaborn orders facets by category order or
# first appearance in the data, which need not match a hand-written list.
titles={'Blmngtn': 'Bloomington \nHeights',
        'Blueste': 'Bluestem',
        'BrDale': 'Briardale',
        'BrkSide': 'Brookside',
        'ClearCr': 'Clear \nCreek',
        'CollgCr': 'College \nCreek',
        'Crawfor': 'Crawford',
        'Edwards': 'Edwards',
        'Gilbert': 'Gilbert',
        'IDOTRR': 'IOWA DOT & \nRail Road',
        'MeadowV': 'Meadow \nVillage',
        'Mitchel': 'Mitchell',
        'NAmes': 'North \nAmes',
        'NoRidge': 'Northridge',
        'NPkVill': 'Northpark \nVilla',
        'NridgHt': 'Northridge \nHeights',
        'NWAmes': 'Northwest \nAmes',
        'OldTown': 'Old Town',
        'SWISU': 'South&West\nof ISU',
        'Sawyer': 'Sawyer',
        'SawyerW': 'Sawyer \nWest',
        'Somerst': 'Somerset',
        'StoneBr': 'Stone \nBrook',
        'Timber': 'Timberland',
        'Veenker': 'Veenker'}
xticks=[6, 7, 8]
yticks=[11,12,13]
# axes_dict maps each facet's Neighborhood value to its Axes.
for code, facet_ax in fig.axes_dict.items():
    facet_ax.tick_params(axis='both', which='both', bottom=False, left=False)
    facet_ax.set_xlabel("Ground Living \nArea (in log Sq Ft)", fontsize='x-large', fontstretch='semi-expanded', fontweight='bold')
    facet_ax.set_xticks(ticks=xticks, labels=xticks, fontsize='xx-large')
    facet_ax.set_yticks(ticks=yticks, labels=yticks, fontsize='xx-large')
    facet_ax.set_ylabel("Sales Price (in logscale)", fontsize='large', fontweight='bold')
    # Fall back to the raw code if a neighbourhood is missing from the mapping.
    facet_ax.set_title(titles.get(code, str(code)), fontsize='x-large', fontweight='bold', x=0.4, y=0.8)

plt.show()
#fig.savefig('B12.jpeg', dpi=1200, bbox_inches='tight')

<center style="font-size:15px;font-style:italic">
    <b><i>
    Figure 12 : Regression plot between log of Sales Price and log of Ground Living Area decomposed by Neighbourhoods
    </i></b>    
</center>