## confirm missing values

In [72]:
%matplotlib inline
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats as st
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Load the train and test CSV files from a single data directory, so the
# location only has to be edited in one place.
# NOTE(review): DATA_DIR is still an absolute, machine-specific path; point it
# at your own copy of the data before running this notebook elsewhere.
DATA_DIR = 'C:/Users/dohan/Desktop/house_prj/house-price-project/data'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Per-column missing-value summary for `train`: the number of nulls ('Total')
# and the fraction of rows that are null ('Percent'), each sorted in
# descending order. The null mask is built once and reused.
null_mask = train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Display the first 20 rows of the summary.
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


## 1. GarageX (GarageCond, GarageType, GarageYrBlt, GarageFinish, GarageQual)

### 결측값 판단

차고가 없는 집은 Garage 관련 변수가 모두 NaN으로 기록되어 있다. 즉, 이 NaN은 실제 결측값이 아니라 '차고 없음'을 의미한다.

In [8]:
# Garage-related columns that are inspected (and later filled) together.
garage_cols = [
    'GarageYrBlt',
    'GarageCond',
    'GarageFinish',
    'GarageQual',
    'GarageType',
]
# Show the garage columns for every row whose GarageYrBlt is null.
garage_year_missing = train['GarageYrBlt'].isnull()
train.loc[garage_year_missing, garage_cols]

Unnamed: 0,GarageYrBlt,GarageCond,GarageFinish,GarageQual,GarageType
39,,,,,
48,,,,,
78,,,,,
88,,,,,
89,,,,,
99,,,,,
108,,,,,
125,,,,,
127,,,,,
140,,,,,


### 처리 방안
문자열(object) 변수의 NaN은 'None'으로, 그 외(수치형) 변수의 NaN은 0으로 변경한다.

In [9]:
# Replace the NaNs in every garage column of `train`, in place:
#   * numeric columns     -> 0
#   * all other columns   -> the literal string 'None'
# The dtype check uses pandas' own helper rather than the `np.object` alias,
# which was deprecated in NumPy 1.20 and removed in NumPy 1.24 (where it
# raises AttributeError). For the object/numeric columns produced by
# read_csv this selects the same branch as the original `dtype == np.object`.
for col in garage_cols:
    is_missing = train[col].isnull()
    fill_value = 0 if pd.api.types.is_numeric_dtype(train[col]) else 'None'
    train.loc[is_missing, col] = fill_value

## 2. LotFrontage


### 결측값 판단
LotArea, LotConfig, LotShape에는 missing value가 없다. 하지만 LotFrontage에 259개의 missing value가 존재한다. 따라서 이는 결측값이다.

### 처리방안 1
LotFrontage의 median 값을 활용

In [63]:
# Impute the missing LotFrontage values of `train` with the column median.
# The filled column is assigned back explicitly instead of calling
# fillna(..., inplace=True) on train['LotFrontage']: that chained in-place call
# acts on an intermediate object, emits a warning on pandas 2.x and leaves
# `train` unchanged when Copy-on-Write is enabled.
# NOTE(review): after this cell no LotFrontage value is null, so any later cell
# that fills only the remaining nulls will have nothing left to fill unless
# `train` is reloaded first.
frontage_median = train['LotFrontage'].median()
train['LotFrontage'] = train['LotFrontage'].fillna(frontage_median)
train[['LotFrontage', 'LotArea']]

Unnamed: 0,LotFrontage,LotArea
0,65.0,8450
1,80.0,9600
2,68.0,11250
3,60.0,9550
4,84.0,14260
5,85.0,14115
6,75.0,10084
7,69.0,10382
8,51.0,6120
9,50.0,7420


### 처리방안 2
LotFrontage는 집터와 접한 땅의 길이이다. LotArea는 집터 면적이다.
따라서 LotArea가 클수록 LotFrontage 값도 클 것이라 가정하고, LotArea의 square root 값을 LotFrontage의 missing value에 넣어준다. 단, 이 방법은 LotFrontage가 아직 NaN인 행에만 적용되므로, 처리방안 1을 먼저 실행한 경우에는 데이터를 다시 불러온 뒤 실행해야 한다.

In [75]:
# Impute the missing LotFrontage values with the square root of the same row's
# LotArea. Only rows whose LotFrontage is currently null are written; the
# assignment aligns on the row index of the selected rows.
isnullLotFrontage = train["LotFrontage"].isnull()
sqrtLotArea = np.sqrt(train.loc[isnullLotFrontage, "LotArea"])
train.loc[isnullLotFrontage, "LotFrontage"] = sqrtLotArea
train[['LotFrontage', 'LotArea']]

Unnamed: 0,LotFrontage,LotArea
0,65.000000,8450
1,80.000000,9600
2,68.000000,11250
3,60.000000,9550
4,84.000000,14260
5,85.000000,14115
6,75.000000,10084
7,101.892100,10382
8,51.000000,6120
9,50.000000,7420
