# 범주형 데이터 다루기

- nominal(명목형) : 순서가 없는 범주

- ordinal(순서형) : 순서가 있는 범주

### 순서가 없는 범주형 특성 인코딩하기

In [5]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Single-column array of state names: a nominal (unordered) categorical feature.
feature = np.array(
    [[state] for state in ('Texas', 'California', 'Texas', 'Delaware', 'Texas')]
)

# Fit the binarizer on the feature and produce one indicator column per class.
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [6]:
# Classes learned while fitting; indicator column i corresponds to classes_[i].
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [7]:
# Round trip: encode the feature, then decode the indicator matrix back to labels.
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [8]:
import pandas as pd

# One indicator column per distinct state. dtype=int keeps the columns as 0/1
# integers; newer pandas releases otherwise return boolean (True/False) columns.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [12]:
# Each observation carries several labels at once, so it is given as a tuple.
# NOTE(review): 'Delware' looks like a misspelling of 'Delaware'; it is kept
# unchanged because the recorded notebook outputs contain the same spelling.
multiclass_feature = [
    ('Texas', 'Florida'),
    ('California', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delware', 'Florida'),
    ('Texas', 'Alabama'),
]

# Multi-label encoding: a row gets a 1 in every column whose label it contains.
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [13]:
# Labels discovered during fitting, in the column order of the encoded matrix.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Two categorical columns in one array. NumPy stores the mixed str/int rows as
# strings, so the numeric column is encoded as the string categories '1' and '3'.
feature = np.array([
    ['Texas', 1],
    ['California', 1],
    ['Texas', 3],
    ['Delaware', 1],
    ['Texas', 1],
])

# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases, so neither spelling works on
# every version. Keep the default sparse result and densify it explicitly.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit_transform(feature).toarray()

array([[0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0.]])

In [15]:
# Categories found per input column (one array for each column of `feature`).
one_hot_encoder.categories_

[array(['California', 'Delaware', 'Texas'], dtype='<U10'),
 array(['1', '3'], dtype='<U10')]

### 순서가 있는 범주형 특성 인코딩하기

In [5]:
import pandas as pd

# Ordinal feature: the labels have a natural order (Low < Medium < High).
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Explicit label -> rank mapping that encodes that order.
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

# Series.map performs a direct dictionary lookup and yields an integer Series.
# Series.replace with a str -> int mapping relies on implicit downcasting of
# the object column, which newer pandas versions deprecate.
# Note: a label missing from the mapping becomes NaN here instead of being
# left untouched, which makes an incomplete mapping easy to spot.
df = dataframe["Score"].map(scale_mapper)

In [6]:
# Display the encoded Score column produced in the previous cell.
df

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [7]:
# Same ordinal feature with one extra level that sits between Medium and High.
dataframe = pd.DataFrame(
    {"Score": ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"]}
)

# The mapping is extended so the new level gets its own rank and High moves up.
scale_mapper = {"Low": 1, "Medium": 2, "Barely More Than Medium": 3, "High": 4}

# Use Series.map (direct lookup) rather than Series.replace, whose implicit
# str -> int downcasting is deprecated in newer pandas versions.
dataframe["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [9]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Mixed str/int rows: NumPy stores every element as a string.
features = np.array([
    ["Low", 10],
    ["High", 50],
    ["Medium", 3],
])

# Integer codes follow the sorted order of each column's categories (see the
# recorded output: High=0, Low=1, Medium=2), not the semantic Low < Medium < High.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit_transform(features)

array([[1., 0.],
       [0., 2.],
       [2., 1.]])

In [10]:
# Per-column categories; the position of a value in its array is its integer code.
ordinal_encoder.categories_

[array(['High', 'Low', 'Medium'], dtype='<U6'),
 array(['10', '3', '50'], dtype='<U6')]

### 특성 딕셔너리를 인코딩하기

In [11]:
from sklearn.feature_extraction import DictVectorizer

# One dictionary per observation; keys are feature names, values are their
# numeric values. Keys absent from a dictionary show up as 0 in the matrix.
data_dict = [
    {'Red': 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# sparse=False returns a dense NumPy array instead of a sparse matrix.
dictvectorizer = DictVectorizer(sparse=False)

# Learn the feature names and build the feature matrix in one step.
features = dictvectorizer.fit_transform(data_dict)

features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [12]:
# Column names of the matrix, in column order. The fitted `feature_names_`
# attribute holds the same list that `get_feature_names()` used to return;
# that method has been removed from newer scikit-learn releases.
feature_names = dictvectorizer.feature_names_

feature_names

['Blue', 'Red', 'Yellow']

In [13]:
# Wrap the dense feature matrix in a DataFrame labelled with the feature names.
pd.DataFrame(features, columns = feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


### 누락된 클래스 값 대체하기

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows: column 0 is the class label, columns 1-2 are the features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose class label (column 0) is missing.
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Train a distance-weighted 3-nearest-neighbours classifier that predicts the
# class column from the two feature columns of the complete rows.
clf = KNeighborsClassifier(3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the missing class labels from the features of the incomplete rows.
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted labels back in front of their feature columns.
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Imputed rows first, followed by the originally complete rows.
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [21]:
from sklearn.impute import SimpleImputer 



# Stack the rows with missing class labels on top of the complete rows.
X_complete = np.concatenate((X_with_nan, X), axis=0)

# Fill each missing value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

### 불균형한 클래스 다루기

In [22]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

iris = load_iris()

# Drop the first 40 observations so that class 0 becomes rare
# (the recorded output shows 10 zeros followed by 100 ones).
features = iris.data[40:, :]
target = iris.target[40:]

# Collapse to a binary target: class 0 stays 0, every other class becomes 1.
target = np.where(target == 0, 0, 1)

target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [24]:
# Explicit per-class weights (class label -> weight).
# NOTE(review): `weights` is not passed to the classifier in this cell; confirm
# whether `class_weight=weights` was meant to be demonstrated as well.
weights = {0: .9, 1: 0.1}

# 'balanced' asks scikit-learn to choose the class weights itself rather than
# taking an explicit mapping.
RandomForestClassifier(class_weight = 'balanced')

RandomForestClassifier(class_weight='balanced')

In [26]:
# Positions of the observations belonging to each class.
i_class0 = np.flatnonzero(target == 0)
i_class1 = np.flatnonzero(target == 1)

# Class sizes.
n_class0, n_class1 = len(i_class0), len(i_class1)

# Downsample: draw, without replacement, as many class-1 observations as there
# are class-0 observations. No seed is set, so the draw differs between runs.
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

# Target vector of the balanced sample: all of class 0, then the class-1 draw.
np.concatenate((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [27]:
# Feature rows matching the downsampled target (class 0 first); show the first five.
np.vstack((features[i_class0,:],features[i_class1_downsampled,:]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [29]:
# Upsample: draw class-0 observations with replacement until there are as many
# as class-1 observations. No seed is set, so the draw differs between runs.
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

# Target vector of the balanced sample (this value is not displayed, because it
# is not the last expression of the cell).
np.hstack((target[i_class0_upsampled], target[i_class1]))

# Matching feature rows, upsampled class 0 first; show the first five.
np.concatenate((features[i_class0_upsampled, :], features[i_class1, :]), axis=0)[:5]

array([[4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.3, 0.3],
       [5.3, 3.7, 1.5, 0.2]])