In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
df = pd.read_csv('data/train.csv')

In [None]:
df.head()

## Target

In [None]:
df['SalePrice'] = np.log(df['SalePrice'])

## Land Property
LotFrontage, LotArea, LotShape, LandContour, LandSlope

In [None]:
land_features = df[['LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LandSlope']]
land_features

### LotFrontage

In [None]:
df['LotFrontage'] = df['LotFrontage'].fillna(value = df['LotFrontage'].median())

In [None]:
df['LotFrontage'].isna().sum()

In [None]:
data = land_features.copy()

In [None]:
df['LotFrontage'] = np.log(df['LotFrontage'])

In [None]:
corr, p_val = stats.pearsonr(df['LotFrontage'], df['SalePrice'])
print("corr : ", corr, "\np_val : ", p_val)

In [None]:
p_val < 0.05

### LotArea

In [None]:
df['LotArea'] = np.log(df['LotArea'])

In [None]:
df[['LotArea', 'LotFrontage', 'SalePrice']].corr()

In [None]:
corr, p_val = stats.pearsonr(df['LotArea'], df['SalePrice'])
print("corr : ", corr, "\np_val : ", p_val)

In [None]:
p_val < 0.05

### Checking Corr between independent features

In [None]:
df[['LotArea', 'LotFrontage']].corr()

Drop Lot Frontage

In [None]:
df['LotShape'].unique()

In [None]:
g1 = df[df['LotShape'] == 'Reg']['SalePrice']
g2 = df[df['LotShape'] == 'IR1']['SalePrice']
g3 = df[df['LotShape'] == 'IR2']['SalePrice']
g4 = df[df['LotShape'] == 'IR3']['SalePrice']

In [None]:
f_stat, p_val = stats.f_oneway(g1, g2, g3, g4)
print(f_stat, p_val)

In [None]:
p_val < 0.05

In [None]:
df['LandSlope'].unique()

In [None]:
g1 = df[df['LandSlope'] == 'Gtl']['LotArea']
g2 = df[df['LandSlope'] == 'Mod']['LotArea']
g3 = df[df['LandSlope'] == 'Sev']['LotArea']

In [None]:
f_stat, p_val = stats.f_oneway(g1, g2, g3, g4)
print(f_stat, p_val)

# Basement Features


In [None]:
basement_features = df[['BsmtQual',
'BsmtCond',
'BsmtExposure',
'BsmtFinType1',
'BsmtFinSF1',
'BsmtFinType2',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'BsmtFullBath',
'BsmtHalfBath',]]

In [None]:
basement_features

In [None]:
data = basement_features.copy()

In [None]:
plt.hist(df['BsmtFinSF1'], bins = 40)
plt.show()

In [None]:
corr, p_val = stats.pearsonr(df['BsmtFinSF1'], df['TotalBsmtSF'])
print(corr, p_val)

In [None]:
p_val < 0.05

In [None]:
df['BsmtFullBath'].unique()

In [None]:
df['BsmtQual'] = df['BsmtQual'].fillna(value = 'No Basement')
df['BsmtCond'] = df['BsmtCond'].fillna(value = 'No Basement')
df['BsmtExposure'] = df['BsmtExposure'].fillna(value = 'No Basement')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(value = 'No Basement')


In [None]:

def anova_groups(d_feature, c_feature, ignore = ''):
    groups = []
    for cat in df[d_feature].unique():
        if cat != ignore:
            print(cat)
            df[df[d_feature] == cat][c_feature]
            groups.append(df[df[d_feature] == cat][c_feature])

    return groups
    
    

In [None]:
groups = anova_groups(d_feature = 'KitchenQual', c_feature = 'SalePrice')

In [None]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2])
print(f_stat, p_val)

BasementQual
300.3923243191743 2.031281939790327e-188

BsmtCond
34.504086533842255 1.4927541408939521e-27

BsmtExposure
60.49458686046996 2.590707192162915e-47

BsmtHalfBath
0.030936149052788492 0.9695381138187172

BsmtFinType2
29.61917200515036 2.6062698303303355e-13

BsmtFullBath
29.595109471751318 1.4248324941269371e-18

KitchenQual
569.1052396037583 3.568211071739858e-182



In [None]:
observed = pd.crosstab(
    df[df['BsmtQual'] != 'No Basement']['BsmtQual'], 
    df[df['BsmtExposure'] != 'No Basement' ]['BsmtExposure']
)
observed

In [None]:
chi2_stats, p_val, dof, expected = stats.chi2_contingency(observed)

In [None]:
expected < 5

In [None]:
p_val

# Building Property

In [None]:
building_properties = df[[
    'BldgType',
    'RoofStyle',
    'RoofMatl',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'KitchenQual',
    'Fence'
]]

In [None]:
building_properties.head()

In [None]:
df['BldgType'].unique()

In [None]:
groups = anova_groups(d_feature = 'BsmtCond', c_feature = 'SalePrice', ignore = '')

In [None]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2], groups[3], groups[4])
print(f_stat, p_val)

BldgType
15.211667391724271 3.4367936403015973e-12

BsmtCond
19.70813904568719 8.195793756122466e-16

Fence
16.10290232659194 6.560318639536422e-13

GrLivArea
0.7086244776126521 4.518033646779945e-223

1stFlrSF
0.6058521846919146 5.394710618971284e-147

2ndFlrSF
0.3193338028320678 5.764335119183061e-36

In [None]:
df['Fence'] = df['Fence'].fillna('No Fence')

In [None]:
feature = 'KitchenQual'
df.groupby(feature)['SalePrice'].median().plot.bar()
plt.xlabel(feature)
plt.ylabel('Sale Price')
plt.title(feature)
plt.show()

In [None]:
df['BsmtCond'].value_counts()

In [None]:
observed = pd.crosstab(df['RoofStyle'], df['RoofMatl'])
observed

In [None]:
df[(df['RoofStyle'] == 'Gable') & (df['RoofMatl'] == 'CompShg')]['SalePrice'].median()

In [None]:
df['RoofStyle'].unique()

In [None]:
df[(df['RoofStyle'] != 'Gable') & (df['RoofMatl'] != 'CompShg')]

In [None]:
chi2_stats, p_val, dof, expected = stats.chi2_contingency(observed)

In [None]:
expected < 5

In [None]:
p_val < 0.05

In [None]:
df[df['RoofMatl'] != 'CompShg'] = 'Others'
df[df['RoofStyle'] != 'Gable'] = 'Others'

In [None]:

df['RoofStyle'].value_counts()

In [None]:
df['BsmtCond'] = df['BsmtCond'].fillna(value = 'Unknown')

In [None]:
df['BsmtCond'].isna().sum()

In [None]:
plt.hist(df[df['2ndFlrSF'] > 50]['2ndFlrSF'], bins = 50)
plt.show()

In [None]:
corr, p_val = stats.pearsonr(df[df['2ndFlrSF'] > 50]['2ndFlrSF'], df[df['2ndFlrSF'] > 50]['SalePrice'])
print(corr, p_val)

In [None]:
df[['1stFlrSF', 'GrLivArea', '2ndFlrSF', 'SalePrice']].corr()

In [None]:
len(df[df['2ndFlrSF'] > 50]['2ndFlrSF']) / len(df)

In [None]:
data = df.copy()
data['2ndFlrSF'] = np.log(data['2ndFlrSF'])

In [None]:
plt.hist(data['2ndFlrSF'], bins = 50)
plt.show()