In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.ticker as mtick
plt.style.use('ggplot')
import seaborn as sns
pd.options.display.float_format = '{:,.0f}'.format
import dataframe_image as dfi
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot

In [3]:
# Load the raw housing data; the path is relative, so the CSV must sit in the
# notebook's working directory.
Ames = pd.read_csv('Ames_HousePrice.csv')

In [4]:
Ames.head()  # preview the first 5 rows to sanity-check the load

Unnamed: 0.1,Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,909176150,856,126000,30,RL,,7890,Pave,,...,166,0,,,,0,3,2010,WD,Normal
1,2,905476230,1049,139500,120,RL,42.0,4235,Pave,,...,0,0,,,,0,2,2009,WD,Normal
2,3,911128020,1001,124900,30,C (all),60.0,6060,Pave,,...,0,0,,,,0,11,2007,WD,Normal
3,4,535377150,1039,114000,70,RL,80.0,8146,Pave,,...,111,0,,,,0,5,2009,WD,Normal
4,5,534177230,1665,227000,60,RL,70.0,8400,Pave,,...,0,0,,,,0,11,2009,WD,Normal


In [5]:
# Drop the extra 'Unnamed: 0' column (presumably an index saved by an earlier
# export -- confirm against the CSV).
# errors='ignore' makes the cell safe to re-run: the previous in-place drop
# raised KeyError on a second execution because the column was already gone.
Ames = Ames.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Two-column lookup table: every column name next to its pandas dtype.
column_names = Ames.columns
column_dtypes = Ames.dtypes.values
Ames_Data_Types = pd.DataFrame(
    {
        'Column': column_names,
        'Data Type': column_dtypes,
    }
)
# One-off export of the table, left disabled after the first run:
#Ames_Data_Types.to_csv('Ames_Data_Types.csv', index=False)

In [7]:
# First-pass audit: column names, summary statistics, missing data, duplicates.
null_counts = Ames.isnull().sum()                    # NaN tally per column
rows_with_nulls = Ames[Ames.isnull().any(axis=1)]    # rows holding at least one NaN
duplicate_rows = Ames[Ames.duplicated(keep=False)]   # keep=False flags every copy

print('Column List\n', Ames.columns)
print('\nStatistical Snapshot:\n', Ames.describe())
print('\nColumns with Missing Values:\n', null_counts[null_counts != 0])
print('\nRows with Missing Values:\n', rows_with_nulls)
print('\nDuplicate Rows:\n', duplicate_rows)

Column List
 Index(['PID', 'GrLivArea', 'SalePrice', 'MSSubClass', 'MSZoning',
       'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
     

In [8]:
# Remove exact duplicate rows in place (the first occurrence is the one kept
# by default), then confirm that none remain.
Ames.drop_duplicates(inplace=True)
remaining_dupes = Ames.loc[Ames.duplicated(keep=False)]
print('\nDuplicate Rows:\n', remaining_dupes)


Duplicate Rows:
 Empty DataFrame
Columns: [PID, GrLivArea, SalePrice, MSSubClass, MSZoning, LotFrontage, LotArea, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, 1stFlrSF, 2ndFlrSF, LowQualFinSF, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, KitchenQual, TotRmsAbvGrd, Functional, Fireplaces, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PavedDrive, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, PoolQC, Fence, MiscFeature, MiscVal, MoSold, YrSold, SaleType, SaleCondition]
Index: []

[0 rows 

In [9]:
# Missing-data summary: one row per column that contains at least one NaN,
# with the absolute count and the percentage of all rows.
null_counts = Ames.isnull().sum()  # computed once; previously the frame was scanned twice
Ames_Missing_Values = (
    null_counts[null_counts != 0]
    .to_frame('MissingCount')
    .assign(MissingPercent=lambda x: 100 * x['MissingCount'] / len(Ames))
    .reset_index()
    .rename(columns={'index': 'ColumnName'})
)
Ames_Missing_Columns = Ames_Missing_Values['ColumnName']
# A bare expression is rendered only when it is the cell's last statement, so
# the summary is displayed here instead of mid-cell where it was a no-op.
Ames_Missing_Values

In [10]:
# Take the columns that contain NaNs, split them by dtype, and list every
# level (NaN included) of each categorical one.
Ames_Missing_Columns = Ames[Ames_Missing_Values['ColumnName']]
Ames_Missing_Columns_Numeric = Ames_Missing_Columns.select_dtypes(include=['number'])
Ames_Missing_Columns_Categorical = Ames_Missing_Columns.select_dtypes(include=['object', 'category'])
for cat_name, cat_values in Ames_Missing_Columns_Categorical.items():
    print(f'\nValue counts for {cat_name}:')
    print(cat_values.value_counts(dropna=False))


Value counts for Alley:
Alley
NaN     2411
Grvl     105
Pave      63
Name: count, dtype: int64

Value counts for MasVnrType:
MasVnrType
NaN        1572
BrkFace     804
Stone       183
BrkCmn       20
Name: count, dtype: int64

Value counts for BsmtQual:
BsmtQual
TA     1166
Gd     1076
Ex      188
Fa       78
NaN      69
Po        2
Name: count, dtype: int64

Value counts for BsmtCond:
BsmtCond
TA     2315
Gd      101
Fa       88
NaN      69
Ex        3
Po        3
Name: count, dtype: int64

Value counts for BsmtExposure:
BsmtExposure
No     1708
Av      344
Gd      242
Mn      214
NaN      71
Name: count, dtype: int64

Value counts for BsmtFinType1:
BsmtFinType1
GLQ    753
Unf    706
ALQ    397
Rec    265
BLQ    250
LwQ    139
NaN     69
Name: count, dtype: int64

Value counts for BsmtFinType2:
BsmtFinType2
Unf    2179
Rec      97
LwQ      84
NaN      70
BLQ      64
ALQ      53
GLQ      32
Name: count, dtype: int64

Value counts for Electrical:
Electrical
SBrkr    2364
FuseA     168


In [11]:
Ames_Missing_Columns_Numeric.head()  # preview the numeric columns that contain missing values

Unnamed: 0,LotFrontage,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
0,,0,238,0,618,856,1,0,1939,2,399
1,42.0,149,552,393,104,1049,1,0,1984,1,266
2,60.0,0,737,0,100,837,0,0,1930,1,216
3,80.0,0,0,0,405,405,0,0,1940,1,281
4,70.0,0,643,0,167,810,1,0,2001,2,528


In [18]:
#Pivot Tables for Round 1 Imputations

def _most_common(values):
    """Return the first mode of ``values``, or NaN when there is none."""
    modes = values.mode()
    return np.nan if modes.empty else modes.iloc[0]


# Mean lot frontage / lot area for every (zoning, lot shape) combination.
Zoning_Shape_Pivot = (
    Ames
    .pivot_table(
        index=['MSZoning', 'LotShape'],
        aggfunc={'LotFrontage': 'mean', 'LotArea': 'mean'},
    )
    .rename(columns={'LotFrontage': 'LotFrontage_Mean', 'LotArea': 'LotArea_Mean'})
)

# Most common electrical system / veneer type for every (quality, condition) pair.
OverallQual_Cond_Pivot = (
    Ames
    .pivot_table(
        index=['OverallQual', 'OverallCond'],
        aggfunc={'Electrical': _most_common, 'MasVnrType': _most_common},
    )
    .rename(columns={'Electrical': 'Electrical_Mode', 'MasVnrType': 'MasVnrType_Mode'})
)

In [None]:
#Impute LotFrontage
# Each missing LotFrontage is filled with the mean of its (MSZoning, LotShape)
# group from Zoning_Shape_Pivot. The overall mode is used when the group is
# absent from the pivot or when the group's mean is itself NaN.
missingRowIndexes = Ames['LotFrontage'].isnull()

if missingRowIndexes.any():  # nothing to fill on a re-run; also avoids apply() on an empty frame
    # Loop-invariant values, computed once instead of once per row.
    fallback_frontage = Ames['LotFrontage'].dropna().mode().iloc[0]
    group_means = Zoning_Shape_Pivot['LotFrontage_Mean']

    def _impute_lot_frontage(row):
        """Return the row's group mean when usable, otherwise the fallback."""
        key = (row['MSZoning'], row['LotShape'])
        if key in group_means.index:
            group_mean = group_means.loc[key]
            # The pivot also aggregates LotArea, so a group whose LotFrontage
            # values are all missing still appears in it with a NaN mean;
            # previously that NaN was written straight back into the column.
            if pd.notna(group_mean):
                return group_mean
        return fallback_frontage

    Ames.loc[missingRowIndexes, 'LotFrontage'] = (
        Ames.loc[missingRowIndexes].apply(_impute_lot_frontage, axis=1)
    )

In [None]:
#Custom Histogram KDE Function
def custom_kde(frame_name, col_name, bins=100):
    """Plot a histogram with a KDE overlay for one column of a DataFrame.

    Vertical lines mark the mean, median and first mode, and a text box
    reports the column's skewness and kurtosis. The figure is rendered with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    frame_name : pandas.DataFrame
        Frame that holds the column to plot.
    col_name : str
        Name of the column in ``frame_name``; also used as title and x-label.
    bins : int, optional
        Number of histogram bins. Defaults to 100, the value that used to be
        hard-coded.
    """
    # Style setup (this changes seaborn's global style, not just this figure)
    sns.set_style('whitegrid')

    series = frame_name[col_name]

    # Central-tendency markers for the selected column
    mean_col = series.mean()
    median_col = series.median()
    mode_col = series.mode().values[0]  # first mode when several values tie

    # Plotting histogram
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.histplot(x=series, bins=bins, kde=True, color='skyblue', ax=ax)
    ax.axvline(mean_col, color='r', linestyle='--', label=f'Mean: {mean_col:,.0f}')
    ax.axvline(median_col, color='g', linestyle='-', label=f'Median: {median_col:,.0f}')
    ax.axvline(mode_col, color='b', linestyle='-.', label=f'Mode: {mode_col:,.0f}')

    # Annotations for skewness and kurtosis.
    # Positioned in axes-fraction coordinates so the box stays inside the plot
    # for any column; the old data-coordinate anchor (500000, 100) only made
    # sense for a price-scaled x-axis and fell far outside the axes for
    # columns such as GrLivArea or OverallQual.
    ax.annotate(
        'Skewness: {:.2f}\nKurtosis: {:.2f}'.format(series.skew(), series.kurt()),
        xy=(0.97, 0.60), xycoords='axes fraction', ha='right', va='top',
        fontsize=14,
        bbox=dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='aliceblue'),
    )

    ax.set_title(col_name)
    ax.legend()
    ax.set_xlabel(col_name)
    ax.set_ylabel('Frequency')
    # Thousands separators on both axes (one formatter instance per axis)
    ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda y, _: '{:,.0f}'.format(y)))
    plt.show()

In [None]:
custom_kde(Ames, 'GrLivArea')  # distribution of living area

In [None]:
custom_kde(Ames, 'OverallCond')  # distribution of the overall condition rating

In [None]:
custom_kde(Ames, 'OverallQual')  # distribution of the overall quality rating

In [None]:
# Scatter plus fitted regression line of SalePrice against each candidate
# predictor. The three panels differ only in column and labels, so they are
# driven from one table instead of three copy-pasted stanzas.
panels = [
    # (column,       panel title,                  x-axis label)
    ('GrLivArea',   'Price vs Living Area',       'Area'),
    ('OverallCond', 'Price vs Overall Condition', 'Overall Condition'),
    ('OverallQual', 'Price vs Overall Quality',   'Overall Quality'),
]

fig, ax = plt.subplots(1, 3, figsize=(15, 4))

for panel_ax, (feature, title, x_label) in zip(ax, panels):
    sns.regplot(x=feature, y='SalePrice', data=Ames, ax=panel_ax)
    # Thousands separators on both axes (one formatter instance per axis)
    panel_ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
    panel_ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda y, _: '{:,.0f}'.format(y)))
    panel_ax.set_title(title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price')

# regplot hands back the Axes it drew on; keep the original names bound in
# case a later cell refers to them.
scat1, scat2, scat3 = ax

fig.tight_layout()  # stop neighbouring y-labels and tick labels from colliding
plt.show()          # also suppresses the Text(...) repr the cell used to end with

In [None]:
# Baseline model: simple linear regression of SalePrice on GrLivArea.
Y = Ames['SalePrice']
X = Ames[['GrLivArea']]
# random_state=1 matches the multi-feature fits further down, so every model is
# scored on the same train/test partition. The previous seed (42) gave this
# baseline a different split, so its R^2 was not directly comparable with theirs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
lm.fit(X_train, Y_train)  # note: refits the shared module-level estimator
print('R^2 of the train set: %.4f' % (lm.score(X_train, Y_train)))
print('R^2 of the test set: %.4f' % (lm.score(X_test, Y_test)))
print('Intercept:', lm.intercept_)
print('Coefficient:', lm.coef_)

In [None]:
# Three-predictor model: living area plus overall quality and condition.
# Y is reused from the baseline cell; lm is the shared estimator and is refit here.
predictors = ['GrLivArea', 'OverallQual', 'OverallCond']
X = Ames[predictors]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)
lm.fit(X_train, Y_train)

train_r2 = lm.score(X_train, Y_train)
test_r2 = lm.score(X_test, Y_test)
print('R^2 of the train set: %.4f' % train_r2)
print('R^2 of the test set: %.4f' % test_r2)
print('Intercept:', lm.intercept_)
print('Coefficient:', lm.coef_)

In [None]:
# Two-predictor model: living area and overall quality only.
# Y is reused from the baseline cell; lm is the shared estimator and is refit here.
predictors = ['GrLivArea', 'OverallQual']
X = Ames[predictors]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)
lm.fit(X_train, Y_train)

train_r2 = lm.score(X_train, Y_train)
test_r2 = lm.score(X_test, Y_test)
print('R^2 of the train set: %.4f' % train_r2)
print('R^2 of the test set: %.4f' % test_r2)
print('Intercept:', lm.intercept_)
print('Coefficient:', lm.coef_)