## Categorical Variable

user['직업'].unique()
> ['선생님','마케터','대표','개발자']
이때 '직업' 컬럼은 4가지 범주형(문자열) 값으로 이루어져 있는데, 인코딩 없이 그대로 머신러닝 모델에 넣으면 오류가 발생한다.

처리 방법

1. 컬럼 drop
2. Label Encoding
3. One-Hot-Encoding

MAE(mean absolute error)로 어떤 방법이 가장 적합한지 확인

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
# Read the housing-price training and test tables from CSV.
X, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/HousingPrice/train.csv",
        "../data/HousingPrice/test.csv",
    )
)

In [5]:
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [12]:
# Rows with no sale price cannot serve as training examples, so drop them
# before separating the target from the feature table.
X.dropna(subset=['SalePrice'], axis=0, inplace=True)
# pop() returns the column and removes it from X in one step.
y = X.pop('SalePrice')

In [13]:
# Find every training column that contains at least one missing value, then
# remove those columns from both the training and the test table.
cols_with_missing = X.columns[X.isnull().any()].tolist()
X.drop(columns=cols_with_missing, inplace=True)
X_test.drop(columns=cols_with_missing, inplace=True)

In [14]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [15]:
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
618,619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,108,0,0,260,0,0,7,2007,New,Partial
870,871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,8,2009,WD,Normal
92,93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,...,0,44,0,0,0,0,8,2009,WD,Normal
817,818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,...,59,0,0,0,0,0,7,2008,WD,Normal
302,303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,81,0,0,0,0,0,1,2006,WD,Normal


In [17]:
#MAE 함수
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=100, random_state=0):
    """Return the validation MAE of a random forest fitted on the given split.

    Args:
        X_train: Feature table used to fit the model.
        X_valid: Feature table the fitted model predicts on.
        y_train: Targets aligned with ``X_train``.
        y_valid: True targets aligned with ``X_valid``.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the forest. Defaults to 0.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return mean_absolute_error(y_valid, model.predict(X_valid))

## 1. Drop columns with categorical data

- use select_dtypes

In [18]:
# Approach 1: keep only the non-object columns of each split.
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_valid)
)

In [20]:
score_dataset(drop_X_train,drop_X_valid,y_train,y_valid)

17952.591404109586

## 2. Label encoding

체크사항!!

train 데이터셋과 validation 데이터셋에서 라벨 인코딩하려는 컬럼의 유니크 값을 확인해보기

In [22]:
# Show the distinct 'Condition2' values of the training and validation splits
# so the two sets can be compared by eye.
print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())

Unique values in 'Condition2' column in training data: ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']

Unique values in 'Condition2' column in validation data: ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']


'RRAn','RRNn' 값은 validation 데이터셋에는 있지만 train 데이터셋에는 없기 때문에, train 데이터로 fit한 라벨 인코더로 validation 데이터를 변환하면 오류가 발생함 (반대로 'PosA','RRAe'처럼 train 데이터셋에만 있는 값은 오류를 일으키지 않음)

따라서 컬럼 값의 집합이 다른 컬럼은 Drop 시킴

In [23]:
# Columns stored with the object dtype are treated as categorical.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# A column is kept for label encoding only when the training and validation
# splits contain exactly the same set of values.
good_label_cols = [col for col in object_cols if
                  set(X_train[col]) == set(X_valid[col])]

# Every other object column will be dropped. Filtering the ordered list
# (instead of taking a set difference) keeps the result in the original
# column order, so it is identical on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

In [24]:
good_label_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'LotConfig',
 'BldgType',
 'HouseStyle',
 'ExterQual',
 'CentralAir',
 'KitchenQual',
 'PavedDrive',
 'SaleCondition']

In [25]:
bad_label_cols

['Exterior2nd',
 'Foundation',
 'RoofStyle',
 'Heating',
 'Functional',
 'Condition1',
 'ExterCond',
 'Neighborhood',
 'LandSlope',
 'HeatingQC',
 'Exterior1st',
 'RoofMatl',
 'Condition2',
 'SaleType',
 'Utilities']

In [29]:
from sklearn.preprocessing import LabelEncoder

# Start from frames that no longer contain the columns whose value sets
# differ between the two splits.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Learn the value -> integer mapping from the training column, then apply
# that same mapping to both splits. fit() replaces the previous mapping, so
# one encoder object can be reused for every column.
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

In [30]:
score_dataset(label_X_train, label_X_valid, y_train, y_valid)

17675.942500000005

## 3. One-Hot-Encode

*주의: 유니크 값의 개수가 많은(Cardinality가 큰) 컬럼을 one-hot-encode 하면 컬럼 수가 크게 늘어나 데이터프레임이 커지므로 제외한다 (여기서는 유니크 값이 10개 미만인 컬럼만 인코딩)

In [31]:
# Only object columns with fewer than 10 distinct training values are
# one-hot encoded.
unique_counts = X_train[object_cols].nunique()
low_cardinality_cols = unique_counts[unique_counts < 10].index.tolist()

In [32]:
low_cardinality_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [37]:
# Object columns that are left out of one-hot encoding. Filtering the ordered
# list (instead of taking a set difference) keeps the columns in their
# original order, so the result is identical on every run.
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

In [38]:
high_cardinality_cols

['Exterior2nd', 'Exterior1st', 'Neighborhood']

In [39]:
from sklearn.preprocessing import OneHotEncoder

In [42]:
# Request a dense array so the result can be wrapped in a DataFrame, and
# ignore categories that were not seen while fitting.
try:
    # `sparse_output` replaces `sparse`, which newer scikit-learn no longer accepts.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, then apply the same encoding to validation.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

In [46]:
# Building a DataFrame from the encoder output gave it a fresh default index;
# put the original row labels back so the later concat aligns row by row.
for encoded, source in ((OH_cols_train, X_train), (OH_cols_valid, X_valid)):
    encoded.index = source.index

In [47]:
# Remove every object column; the low-cardinality ones are added back later
# in their one-hot encoded form.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

In [48]:
# Put the non-object features and the one-hot columns side by side.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns carry integer labels while the original columns carry
# string labels. Recent scikit-learn refuses to fit on mixed-type feature
# names, so make every column label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)


In [49]:
# Report the validation MAE obtained with the one-hot encoded features.
print("MAE from Approach 3 (One-Hot Encoding):")
one_hot_mae = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
print(one_hot_mae)

MAE from Approach 3 (One-Hot Encoding):
17514.224246575344
