In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
df = pd.read_csv('data/train.csv')

In [None]:
df.head()

## Target

In [None]:
df['SalePrice'] = np.log(df['SalePrice'])

## Land Property
LotFrontage, LotArea, LotShape, LandContour, LandSlope

In [None]:
land_features = df[['LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LandSlope']]
land_features

### LotFrontage

In [None]:
df['LotFrontage'] = df['LotFrontage'].fillna(value = df['LotFrontage'].median())

In [None]:
df['LotFrontage'].isna().sum()

In [None]:
data = land_features.copy()

In [None]:
df['LotFrontage'] = np.log(df['LotFrontage'])

In [None]:
corr, p_val = stats.pearsonr(df['LotFrontage'], df['SalePrice'])
print("corr : ", corr, "\np_val : ", p_val)

In [None]:
p_val < 0.05

### LotArea

In [None]:
df['LotArea'] = np.log(df['LotArea'])

In [None]:
df[['LotArea', 'LotFrontage', 'SalePrice']].corr()

In [None]:
corr, p_val = stats.pearsonr(df['LotArea'], df['SalePrice'])
print("corr : ", corr, "\np_val : ", p_val)

In [None]:
p_val < 0.05

### Checking Corr between independent features

In [None]:
df[['LotArea', 'LotFrontage']].corr()

Drop Lot Frontage

In [None]:
df['LotShape'].unique()

In [None]:
g1 = df[df['LotShape'] == 'Reg']['SalePrice']
g2 = df[df['LotShape'] == 'IR1']['SalePrice']
g3 = df[df['LotShape'] == 'IR2']['SalePrice']
g4 = df[df['LotShape'] == 'IR3']['SalePrice']

In [None]:
f_stat, p_val = stats.f_oneway(g1, g2, g3, g4)
print(f_stat, p_val)

In [None]:
p_val < 0.05

In [None]:
df['LandSlope'].unique()

In [None]:
g1 = df[df['LandSlope'] == 'Gtl']['LotArea']
g2 = df[df['LandSlope'] == 'Mod']['LotArea']
g3 = df[df['LandSlope'] == 'Sev']['LotArea']

In [None]:
f_stat, p_val = stats.f_oneway(g1, g2, g3, g4)
print(f_stat, p_val)

# Basement Features


In [None]:
basement_features = df[['BsmtQual',
'BsmtCond',
'BsmtExposure',
'BsmtFinType1',
'BsmtFinSF1',
'BsmtFinType2',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'BsmtFullBath',
'BsmtHalfBath',]]

In [None]:
basement_features

In [None]:
data = basement_features.copy()

In [None]:
plt.hist(df['BsmtFinSF1'], bins = 40)
plt.show()

In [None]:
corr, p_val = stats.pearsonr(df['BsmtFinSF1'], df['TotalBsmtSF'])
print(corr, p_val)

In [None]:
p_val < 0.05

In [None]:
df['BsmtFullBath'].unique()

In [None]:
df['BsmtQual'] = df['BsmtQual'].fillna(value = 'No Basement')
df['BsmtCond'] = df['BsmtCond'].fillna(value = 'No Basement')
df['BsmtExposure'] = df['BsmtExposure'].fillna(value = 'No Basement')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(value = 'No Basement')


In [17]:

def anova_groups(d_feature, c_feature, ignore = ''):
    groups = []
    for cat in df[d_feature].unique():
        if cat != ignore:
            print(cat)
            df[df[d_feature] == cat][c_feature]
            groups.append(df[df[d_feature] == cat][c_feature])

    return groups
    
    

In [None]:
groups = anova_groups(d_feature = 'KitchenQual', c_feature = 'SalePrice')

In [None]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2])
print(f_stat, p_val)

BasementQual
300.3923243191743 2.031281939790327e-188

BsmtCond
34.504086533842255 1.4927541408939521e-27

BsmtExposure
60.49458686046996 2.590707192162915e-47

BsmtHalfBath
0.030936149052788492 0.9695381138187172

BsmtFinType2
29.61917200515036 2.6062698303303355e-13

BsmtFullBath
29.595109471751318 1.4248324941269371e-18

KitchenQual
569.1052396037583 3.568211071739858e-182



In [None]:
observed = pd.crosstab(
    df[df['BsmtQual'] != 'No Basement']['BsmtQual'], 
    df[df['BsmtExposure'] != 'No Basement' ]['BsmtExposure']
)
observed

In [None]:
chi2_stats, p_val, dof, expected = stats.chi2_contingency(observed)

In [None]:
expected < 5

In [None]:
p_val

# Building Property

In [None]:
building_properties = df[[
    'BldgType',
    'RoofStyle',
    'RoofMatl',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'KitchenQual',
    'Fence'
]]

In [None]:
building_properties.head()

In [None]:
df['BldgType'].unique()

In [None]:
groups = anova_groups(d_feature = 'BsmtCond', c_feature = 'SalePrice', ignore = '')

In [None]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2], groups[3], groups[4])
print(f_stat, p_val)

In [None]:
df['Fence'] = df['Fence'].fillna('0')

In [None]:
df.loc[df['Fence'] != '0', 'Fence'] = '1'

In [None]:
feature = 'Fence'
df.groupby(feature)['SalePrice'].median().plot.bar()
plt.xlabel(feature)
plt.ylabel('Sale Price')
plt.title(feature)
plt.show()

In [None]:
df['BsmtExposure'].value_counts()

In [None]:
observed = pd.crosstab(df['RoofStyle'], df['RoofMatl'])
observed

In [None]:
df['RoofStyle'].unique()

In [None]:
chi2_stats, p_val, dof, expected = stats.chi2_contingency(observed)

In [None]:
expected < 5

In [None]:
p_val < 0.05

In [None]:
df.loc[df['RoofStyle'] != 'Gable', 'RoofStyle'] = 'Others'

In [None]:
df.loc[df['RoofMatl'] != 'CompShg', 'RoofMatl'] = 'Others'

In [None]:
observed

In [None]:
g1 = df[df['RoofStyle'] == 'Gable']['SalePrice']
g2 = df[df['RoofStyle'] == 'Others']['SalePrice']

In [None]:
t_stat, p_value = stats.ttest_ind(g1, g2)
print(t_stat, p_val)

In [None]:
corr, p_val = stats.pearsonr(,)
print(corr, p_val)

BldgType
15.211667391724271 3.4367936403015973e-12

BsmtCond
19.70813904568719 8.195793756122466e-16

Fence
16.10290232659194 6.560318639536422e-13

GrLivArea
0.7086244776126521 4.518033646779945e-223

1stFlrSF
0.6058521846919146 5.394710618971284e-147

2ndFlrSF
0.3193338028320678 5.764335119183061e-36

## Heating & Airconditioning

In [5]:
ha = df[['Heating', 'CentralAir','Electrical','Fireplaces','FireplaceQu']].copy()

In [6]:
ha.head()

Unnamed: 0,Heating,CentralAir,Electrical,Fireplaces,FireplaceQu
0,GasA,Y,SBrkr,0,
1,GasA,Y,SBrkr,1,TA
2,GasA,Y,SBrkr,1,TA
3,GasA,Y,SBrkr,1,Gd
4,GasA,Y,SBrkr,1,TA


### Heating

In [7]:
ha['Heating'].isna().sum()

0

In [9]:
df['Heating'].unique()

array(['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'], dtype=object)

In [10]:
ha.Heating.value_counts()

GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: Heating, dtype: int64

### Central AIr

In [11]:
df['CentralAir'].isna().sum()

0

In [12]:
df['CentralAir'].value_counts()

Y    1365
N      95
Name: CentralAir, dtype: int64

### Electrical

In [13]:
df['Electrical'].isna().sum()

1

In [14]:
df['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

### Fireplaces

In [15]:
df['Fireplaces'].isna().sum()

0

In [16]:
df['Fireplaces'].value_counts()

0    690
1    650
2    115
3      5
Name: Fireplaces, dtype: int64

In [18]:
groups = anova_groups(d_feature = 'Fireplaces', c_feature = 'SalePrice', ignore = '')

0
1
2
3


In [19]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2], groups[3])
print(f_stat, p_val)

146.74117584536646 4.4210010307257784e-83


In [20]:
df.loc[df['Fireplaces'] != 0, 'Fireplaces'] = 1

In [21]:
df['Fireplaces'].value_counts()

1    770
0    690
Name: Fireplaces, dtype: int64

In [22]:
g1 = df[df['Fireplaces'] == 0]['SalePrice']
g2 = df[df['Fireplaces'] == 1]['SalePrice']
t_stat, p_value = stats.ttest_ind(g1, g2)
print(t_stat, p_val)

-20.438125225454556 4.4210010307257784e-83


### FireplaceQu

In [23]:
df['FireplaceQu'].isna().sum()

690

In [24]:
df['FireplaceQu'].value_counts()

Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [26]:
df['FireplaceQu'] = df['FireplaceQu'].fillna(value = 'Unknown')

In [27]:
df['FireplaceQu'].value_counts()

Unknown    690
Gd         380
TA         313
Fa          33
Ex          24
Po          20
Name: FireplaceQu, dtype: int64

In [28]:
groups = anova_groups(d_feature = 'FireplaceQu', c_feature = 'SalePrice', ignore = '')

Unknown
TA
Gd
Fa
Ex
Po


In [30]:
f_stat, p_val = stats.f_oneway(groups[0], groups[1], groups[2], groups[3], groups[4], groups[5])
print(f_stat, p_val)

121.07512126691806 2.9712169727633336e-107


### Checking Corr Between Fireplaces and FIreplaceQu

In [31]:
observed = pd.crosstab(df['Fireplaces'], df['FireplaceQu'])
observed

FireplaceQu,Ex,Fa,Gd,Po,TA,Unknown
Fireplaces,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,0,0,0,690
1,24,33,380,20,313,0


In [32]:
stat, p_val, dof, exp = stats.chi2_contingency(observed)

In [33]:
dof

5

In [36]:
exp > 5

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

In [37]:
p_val

0.0