# 불균형한 클래스 처리하기

실전에서는 클래스가 불균형한 경우가 많다. 예를 들어 희귀한 사건의 경우 샘플의 수가 매우 적을 수밖에 없다. 이런 이유 때문에 불균형한 클래스를 다루는 일은 머신러닝에서 흔하다.  
가장 좋은 방법은 소수 클래스의 샘플을 더 많이 모으는 것이지만, 이것이 불가능한 경우가 많기 때문에 다른 선택 사항을 고려해야 한다.  

# 데이터셋 로드

고객이 자동차 보험에 관심을 가질지 여부를 예측하는 모델을 구축해 보자.   

Data Source: Kaggle - Learning from Imbalanced Insurance Data.  
참고) https://www.kaggle.com/datasets/arashnic/imbalanced-data-practice

해당 데이터는 다음과 같은 정보를 가지고 있다.  
- Demographics (gender, age, region code type),
- Vehicles (Vehicle Age, Damage),
- Policy (Premium, sourcing channel) etc.

우리가 예측하고자 하는 label은 Response 변수이다.
- target : Response (0:관심없음 / 1:관심있음)

이 데이터에서 Response는 imbalanced class 이다.  

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the insurance dataset CSV directly from its GitHub raw URL.
data = pd.read_csv('https://raw.githubusercontent.com/aonekoda/reference/main/data/InsuranceData.csv')
# Print the column names, non-null counts and dtypes to inspect the schema.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171969 entries, 0 to 171968
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    171969 non-null  int64  
 1   Gender                171969 non-null  object 
 2   Age                   171969 non-null  int64  
 3   Driving_License       171969 non-null  int64  
 4   Region_Code           171969 non-null  float64
 5   Previously_Insured    171969 non-null  int64  
 6   Vehicle_Age           171969 non-null  object 
 7   Vehicle_Damage        171969 non-null  object 
 8   Annual_Premium        171969 non-null  float64
 9   Policy_Sales_Channel  171969 non-null  float64
 10  Vintage               171969 non-null  int64  
 11  Response              171969 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 15.7+ MB


In [None]:
# Preview the first five rows of the raw data.
data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,361628,Male,45,1,28.0,0,1-2 Year,Yes,30269.0,13.0,226,0
1,163178,Female,23,1,31.0,0,< 1 Year,Yes,2630.0,152.0,71,0
2,17847,Female,48,1,3.0,1,1-2 Year,No,25355.0,26.0,175,0
3,325760,Male,42,1,29.0,1,1-2 Year,No,48779.0,124.0,17,0
4,416289,Male,26,1,15.0,1,< 1 Year,No,35444.0,152.0,89,0


In [None]:
# One-hot encode the categorical columns.
# The column list must be passed as `columns=`: the second positional
# parameter of pd.get_dummies is `prefix`, so a positional list would only
# set the dummy-name prefixes and leave the choice of columns to dtype
# auto-detection.
data = pd.get_dummies(data, columns=['Gender', 'Vehicle_Age', 'Vehicle_Damage'])

In [None]:
# Inspect the schema again to confirm the dummy columns that were created.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171969 entries, 0 to 171968
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     171969 non-null  int64  
 1   Age                    171969 non-null  int64  
 2   Driving_License        171969 non-null  int64  
 3   Region_Code            171969 non-null  float64
 4   Previously_Insured     171969 non-null  int64  
 5   Annual_Premium         171969 non-null  float64
 6   Policy_Sales_Channel   171969 non-null  float64
 7   Vintage                171969 non-null  int64  
 8   Response               171969 non-null  int64  
 9   Gender_Female          171969 non-null  bool   
 10  Gender_Male            171969 non-null  bool   
 11  Vehicle_Age_1-2 Year   171969 non-null  bool   
 12  Vehicle_Age_< 1 Year   171969 non-null  bool   
 13  Vehicle_Age_> 2 Years  171969 non-null  bool   
 14  Vehicle_Damage_No      171969 non-null  bool   
 15  Vehicle_Damage_Yes     171969 non-null  bool   

# Check the Class Imbalance

In [None]:
# Count the rows belonging to each class of the target column.
res_1 = int((data['Response'] == 1).sum())
res_0 = int((data['Response'] == 0).sum())

print("Response == 1 : ", res_1)
print("Response == 0 : ", res_0)
# The minority proportion is measured against the whole dataset
# (res_0 + res_1); dividing by res_0 alone would report the
# minority-to-majority ratio instead. Scaling to percent before rounding
# avoids float artifacts in the printed value.
print("Proportion of Minority Class: ", round(res_1 / (res_0 + res_1) * 100, 2), "%")

Response == 1 :  28252
Response == 0 :  143717
Proportion of Minority Class:  19.66 %


# 데이터 전처리

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the excluded 'id' and the target itself.
X = data.drop(columns=['id', 'Response'])
# Target: the Response label.
y = data['Response']

# Rename only the dummy columns whose names contain spaces or the characters
# '-', '<', '>'. Renaming by name (rather than assigning a positional list to
# X.columns) cannot mislabel a feature if the column order ever changes.
X = X.rename(columns={
    'Vehicle_Age_1-2 Year': 'Vehicle_Age_1_2_Year',
    'Vehicle_Age_< 1 Year': 'Vehicle_Age_lessthan_1_Year',
    'Vehicle_Age_> 2 Years': 'Vehicle_Age_greaterthan_2_Years',
})

# Hold out 30% of the rows for testing; stratify=y keeps the class ratio of
# Response the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Class counts in the training split.
y_train.value_counts()

Unnamed: 0_level_0,count
Response,Unnamed: 1_level_1
0,100602
1,19776


In [None]:
# Class counts in the test split.
y_test.value_counts()

Unnamed: 0_level_0,count
Response,Unnamed: 1_level_1
0,43115
1,8476


# 분류모형 생성

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree trained on the original (imbalanced) training set.
# random_state is fixed so the fitted tree is reproducible.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X=X_train, y=y_train)

In [None]:
# Predict the class labels of the held-out test set with the baseline tree.
pred = tree.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Report test-set accuracy as a percentage rounded to two decimals.
print("Accuracy of Test Set:", round(accuracy_score(y_test, pred) * 100, 2 ), "%")

Accuracy of Test Set: 94.79 %


In [None]:
# Confusion matrix of the baseline predictions against the true test labels.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=pred)
conf_matrix

array([[41739,  1376],
       [ 1311,  7165]])

In [None]:
# Score the baseline predictions, each metric rounded to four decimals.
precision_original = round(precision_score(y_true=y_test, y_pred=pred), 4)
recall_original = round(recall_score(y_true=y_test, y_pred=pred), 4)
F1_score_original = round(f1_score(y_true=y_test, y_pred=pred), 4)

print('Precision = ', precision_original)
print('Recall = ', recall_original)
# F1 is the harmonic mean of precision and recall.
print('F1_score = ', F1_score_original)

Precision =  0.8389
Recall =  0.8453
F1_score =  0.8421


# UpSampling

* 업샘플링 : 소수 클래스의 샘플 수를 늘린다.

In [None]:
from sklearn.utils import resample

In [None]:
# Number of class-1 (minority) training samples before upsampling.
print('업샘플링 전 클래스 1의 샘플 개수 : ', X_train[y_train==1].shape[0] )

업샘플링 전 클래스 1의 샘플 개수 :  19776


In [None]:
# Upsample the minority class (Response == 1): draw with replacement until it
# has as many rows as the majority class. Features and labels are resampled
# together in one call; random_state fixes the draw.
X_res_1, y_res_1 = resample(
    X_train.loc[y_train == 1],
    y_train.loc[y_train == 1],
    replace=True,
    n_samples=X_train.loc[y_train == 0].shape[0],
    random_state=123,
)

In [None]:
# Number of class-1 (minority) samples after upsampling.
print('업샘플링 후 클래스 1의 샘플 개수 : ', X_res_1.shape[0])

업샘플링 후 클래스 1의 샘플 개수 :  100602


In [None]:
# Build the balanced training set: the upsampled minority rows followed by
# the untouched majority-class (Response == 0) training rows.
X_train_upsampled = pd.concat([X_res_1, X_train.loc[y_train == 0]], axis=0)
y_train_upsampled = pd.concat([y_res_1, y_train.loc[y_train == 0]], axis=0)

In [None]:
y_train_upsampled.value_counts() # class counts after upsampling

Unnamed: 0_level_0,count
Response,Unnamed: 1_level_1
1,100602
0,100602


In [None]:
# Same decision-tree configuration as the baseline, trained on the
# upsampled (balanced) training set.
tree_upsampled = DecisionTreeClassifier(random_state=0)
tree_upsampled.fit(X=X_train_upsampled, y=y_train_upsampled)

In [None]:
# Predict on the original (not resampled) test set.
pred_upsampled = tree_upsampled.predict(X_test)

In [None]:
# Confusion matrix for the tree trained on the upsampled data.
confusion_matrix(y_true=y_test, y_pred=pred_upsampled)

array([[41873,  1242],
       [ 1356,  7120]])

In [None]:
# Test-set accuracy (percent, two decimals) for the upsampled model.
print("Accuracy of Test Set:", round(accuracy_score(y_test, pred_upsampled) * 100, 2 ), "%")

Accuracy of Test Set: 94.96 %


In [None]:
# Score the upsampled model's predictions, rounded to four decimals.
precision_upsampled = round(precision_score(y_true=y_test, y_pred=pred_upsampled), 4)
recall_upsampled = round(recall_score(y_true=y_test, y_pred=pred_upsampled), 4)
F1_upsampled = round(f1_score(y_true=y_test, y_pred=pred_upsampled), 4)

print('Precision = ', precision_upsampled)
print('Recall = ', recall_upsampled)
print('F1_score = ', F1_upsampled)

Precision =  0.8515
Recall =  0.84
F1_score =  0.8457


# DownSampling
* 다운샘플링 : 다수 클래스의 샘플 수를 줄인다.

In [None]:
# Downsample the majority class (Response == 0): draw without replacement
# until it has as many rows as the minority class. Features and labels are
# resampled together in one call; random_state fixes the draw.
X_res_0, y_res_0 = resample(
    X_train.loc[y_train == 0],
    y_train.loc[y_train == 0],
    replace=False,
    n_samples=X_train.loc[y_train == 1].shape[0],
    random_state=123,
)

In [None]:
# Build the balanced training set: the downsampled majority rows followed by
# the untouched minority-class (Response == 1) training rows.
X_train_downsampled = pd.concat([X_res_0, X_train.loc[y_train == 1]], axis=0)
y_train_downsampled = pd.concat([y_res_0, y_train.loc[y_train == 1]], axis=0)

In [None]:
y_train_downsampled.value_counts() # class counts after downsampling

Unnamed: 0_level_0,count
Response,Unnamed: 1_level_1
0,19776
1,19776


In [None]:
# Same decision-tree configuration as the baseline, trained on the
# downsampled (balanced) training set.
tree_downsampled = DecisionTreeClassifier(random_state=0)
tree_downsampled.fit(X=X_train_downsampled, y=y_train_downsampled)

In [None]:
# Predict on the original (not resampled) test set.
pred_downsampled = tree_downsampled.predict(X_test)

In [None]:
# Confusion matrix for the tree trained on the downsampled data.
confusion_matrix(y_true=y_test, y_pred=pred_downsampled)

array([[36883,  6232],
       [  798,  7678]])

In [None]:
# Test-set accuracy (percent, two decimals) for the downsampled model.
print("Accuracy of Test Set:", round(accuracy_score(y_test, pred_downsampled) * 100, 2 ), "%")

Accuracy of Test Set: 86.37 %


In [None]:
# Score the downsampled model's predictions, rounded to four decimals.
precision_downsampled = round(precision_score(y_true=y_test, y_pred=pred_downsampled), 4)
recall_downsampled = round(recall_score(y_true=y_test, y_pred=pred_downsampled), 4)
F1_downsampled = round(f1_score(y_true=y_test, y_pred=pred_downsampled), 4)

print('Precision = ', precision_downsampled)
print('Recall = ', recall_downsampled)
print('F1_score = ', F1_downsampled)

Precision =  0.552
Recall =  0.9059
F1_score =  0.686


# 리샘플링 결과 비교

In [None]:
# Gather precision, recall and F1 (in that order) for each training set.
metrics_original = [precision_original, recall_original, F1_score_original]
metrics_upsampled = [precision_upsampled, recall_upsampled, F1_upsampled]
metrics_downsampled = [precision_downsampled, recall_downsampled, F1_downsampled]

# Comparison table: one column per training set, one row per metric.
metrics = pd.DataFrame(
    {
        'Original Dataset': metrics_original,
        'Upsampled Dataset': metrics_upsampled,
        'Downsampled Dataset': metrics_downsampled,
    },
    index=['Precision', 'Recall', 'F1_score'],
)

metrics

Unnamed: 0,Original Dataset,Upsampled Dataset,Downsampled Dataset
Precision,0.8389,0.8515,0.552
Recall,0.8453,0.84,0.9059
F1_score,0.8421,0.8457,0.686


# 실습 하기

- DecisionTreeClassifier 외에 다른 분류 모형으로 훈련해보고 결과를 확인해 보시오.
- 분류 모형 생성시 class_weight 옵션을 사용해서 분류 모형의 성능을 확인해 보시오.