# Preprocessing

앞에서 분류한 것을 바탕으로, one-hot / scaling 등 실제 계산 진행

## 불러오기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm;
import sklearn
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the three CSV files written by the EDA stage: the train split, the
# test split, and the table that assigns each feature to a handling class.
train, test, feat_class = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/EDA/train_EDA_0910.csv',
        'data/EDA/test_EDA_0910.csv',
        'data/EDA/feat_class_0910.csv',
    )
)

In [3]:
# Each column of feat_class holds the feature names of one handling class.
# The columns are padded with NaN, so drop the padding and keep plain lists.
feat_raw, feat_dis, feat_map, feat_onehot, feat_extra, feature_del = (
    feat_class[group].dropna().values.tolist()
    for group in ('raw', 'dis', 'map', 'onehot', 'extra', 'del')
)

# 분류별

- 분류별 feature들을 불러와서 각 분류에 맞게 핸들링

### Raw

In [4]:
# Echo the features in the 'raw' class for inspection.
feat_raw

['LotArea',
 'MasVnrArea',
 'TotalBsmtSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch']

### discrete

In [5]:
# Echo the features in the 'dis' (discrete) class for inspection.
feat_dis

['OverallQual',
 'OverallCond',
 'BsmtFullBath',
 'BsmtHalfBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars']

### map

In [6]:
# Echo the features in the 'map' class for inspection.
feat_map

['ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

### onehot

In [7]:
# Echo the features that will be one-hot encoded.
feat_onehot

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'MasVnrType',
 'ExterCond',
 'Foundation',
 'Electrical',
 'Functional',
 'GarageType',
 'PavedDrive',
 'Fence',
 'SaleType',
 'SaleCondition']

In [8]:
# Optional (currently disabled): also one-hot encode every feature in
# feat_map by appending that list to feat_onehot.
#feat_onehot.extend(feat_map)

In [9]:
# Replace every feature listed in feat_onehot with indicator columns.
# Train and test are encoded independently, so a category present in only
# one of them yields a column that exists in only that frame.
train = pd.get_dummies(data=train, columns=feat_onehot)
test = pd.get_dummies(data=test, columns=feat_onehot)

### extra

In [10]:
# Echo the features in the 'extra' class, which are handled case by case.
feat_extra

['Neighborhood',
 'YearBuilt',
 'YearRemodAdd',
 'Exterior1st',
 'Exterior2nd',
 'GarageYrBlt',
 'MoSold',
 'YrSold']

Neighborhood, MoSold는 일단 one-hot 인코딩.

In [11]:
# One-hot encode the two 'extra' features that are treated as categorical.
# The same call is applied to each frame separately.
train, test = (
    pd.get_dummies(frame, columns=['Neighborhood', 'MoSold'])
    for frame in (train, test)
)

- 년도는 절대적인 년도보다 지어진 지 얼마나 된 건물을 샀는지가 더 중요해 보여서, (지어진 년도와 리모델링 년도의 평균) - 팔린 년도로 계산
- 차고가 지어진 년도는 모르겠어서 일단 제거..

In [12]:
# YearOld = mean(YearBuilt, YearRemodAdd) - YrSold.
# The value is the averaged construction/remodel year relative to the sale
# year; it is computed the same way for both frames.
train = train.assign(
    YearOld=(train['YearBuilt'] + train['YearRemodAdd']) / 2 - train['YrSold']
)
test = test.assign(
    YearOld=(test['YearBuilt'] + test['YearRemodAdd']) / 2 - test['YrSold']
)

In [13]:
# Queue the 'extra' columns for deletion: the year columns have been folded
# into YearOld, and the remaining ones are not used further.
feature_del += [
    'YearBuilt',
    'YearRemodAdd',
    'Exterior1st',
    'Exterior2nd',
    'YrSold',
    'GarageYrBlt',
]

## Delete

In [14]:
# Echo the complete list of columns that are about to be dropped.
feature_del

['Id',
 'LotFrontage',
 'Utilities',
 'Condition2',
 'RoofMatl',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'Heating',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'MiscFeature',
 'PoolArea',
 'PoolQC',
 'MiscVal',
 'YearBuilt',
 'YearRemodAdd',
 'Exterior1st',
 'Exterior2nd',
 'YrSold',
 'GarageYrBlt']

In [15]:
# Remove every column named in feature_del from both frames.
train = train.drop(labels=feature_del, axis='columns')
test = test.drop(labels=feature_del, axis='columns')

One-hot 과정에서 train / test 둘 중 하나에만 존재하는 컬럼 제거

In [16]:
# Columns that exist in only one frame (a by-product of encoding train and
# test separately). The target 'SalePrice' is kept in train.
train_comp = list(set(train.columns).difference(test.columns, ['SalePrice']))
test_comp = list(set(test.columns).difference(train.columns))

# Drop the unmatched columns so both frames share the same feature columns.
train = train.drop(labels=train_comp, axis='columns')
test = test.drop(labels=test_comp, axis='columns')

값의 절댓값 합이 20 미만인 column들(사실상 해당 샘플이 20개 미만인 one-hot column)을 제거함.

In [17]:
# Collect the columns whose absolute values sum to less than 20. For a 0/1
# indicator column that sum is the number of rows flagged with the category.
# The check runs over every column of train, not only the indicator columns.
small_samples = [
    column for column in train.columns if train[column].abs().sum() < 20
]
len(small_samples)

49

In [18]:
# Drop the sparsely populated columns (selected on train) from both frames.
train, test = (
    frame.drop(columns=small_samples) for frame in (train, test)
)

## Min-Max Scaling

In [19]:
# Create the scaler used below to rescale the feature columns.
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()

In [20]:
# Separate the target from the predictors so that only the predictors are
# passed to the scaler.
train_y = train.loc[:, 'SalePrice']
train_X = train.drop(columns=['SalePrice'])

In [21]:
# Fit the min-max scaler on the training features only, then reuse that fit
# for the test set. Calling fit_transform on test as well (the previous
# behaviour) rescaled every test column by the test set's own min/max, so the
# same raw value was mapped to different scaled values in train and test.
X_scale = minmax_scaler.fit_transform(train_X)
train = pd.DataFrame(X_scale, columns=train_X.columns, index=train_X.index)

# Put the test columns in the training column order so each column is scaled
# with the statistics learned for that same feature. This assumes the earlier
# column-alignment steps left both frames with the same feature columns; a
# KeyError here means a training column is missing from test.
test = test[train_X.columns]

# transform() applies the training min/max without refitting. Test values
# outside the training range therefore fall outside [0, 1], which is expected.
test_scale = minmax_scaler.transform(test)
test = pd.DataFrame(test_scale, columns=test.columns, index=test.index)

In [22]:
# Replace the target with log(1 + SalePrice). Predictions made on this scale
# must be mapped back with np.expm1 to obtain prices.
train_y = np.log1p(train_y)

In [23]:
# Re-attach the log-transformed target as the last column of the scaled
# training frame (rows are matched on the index), then echo the result.
train = train.assign(SalePrice=train_y)
train

Unnamed: 0,LotArea,OverallQual,OverallCond,MasVnrArea,ExterQual,BsmtQual,BsmtCond,BsmtExposure,TotalBsmtSF,HeatingQC,...,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YearOld,SalePrice
0,0.033420,0.666667,0.500,0.122500,0.666667,0.8,0.75,0.25,0.140098,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.940860,12.247699
1,0.038795,0.555556,0.875,0.000000,0.333333,0.8,0.75,1.00,0.206547,1.00,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.661290,12.109016
2,0.046507,0.666667,0.500,0.101250,0.666667,0.8,0.75,0.50,0.150573,1.00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.924731,12.317171
3,0.038561,0.666667,0.500,0.000000,0.333333,0.6,1.00,0.25,0.123732,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311828,11.849405
4,0.060576,0.777778,0.500,0.218750,0.666667,0.8,0.75,0.75,0.187398,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.908602,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.030929,0.555556,0.500,0.000000,0.333333,0.8,0.75,0.25,0.155974,1.00,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.913978,12.072547
1456,0.055505,0.555556,0.625,0.074375,0.333333,0.8,0.75,0.25,0.252373,0.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.704301,12.254868
1457,0.036187,0.666667,1.000,0.000000,1.000000,0.6,1.00,0.25,0.188543,1.00,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.602151,12.493133
1458,0.039342,0.444444,0.625,0.000000,0.333333,0.6,0.75,0.50,0.176432,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.596774,11.864469


In [24]:
# Persist the preprocessed frames for the modelling stage. The row index is
# not written to the files.
train.to_csv(path_or_buf='data/preprocess/train_0910.csv', index=False)
test.to_csv(path_or_buf='data/preprocess/test_0910.csv', index=False)