In [1]:
# mieyhgnaj set 🤪

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("white")

import missingno as msno
import plotly.express as px
from matplotlib import cm
from matplotlib import style
from matplotlib import font_manager
from matplotlib import rc

# Set matplotlib's default font family for every figure in this notebook.
# NOTE(review): presumably chosen so that non-Latin (Korean) labels render
# correctly — confirm the font is installed on the target machine.
rc('font', family='Arial Unicode MS')

import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

## 🚙 `타깃 컬럼을 제외한 컬럼은 SOCAR 측의 요청대로 비공개 처리!`

### 0. encoding ver.

In [34]:
# Read the pre-encoded train and test splits; the first CSV column is used
# as the row index for both frames.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('datas/train_encode_ver2.csv', 'datas/test_encode_ver2.csv')
)

In [3]:
# Separate each split into its feature matrix (every column except the
# label) and its `fraud_YN` label vector.
X_train, y_train = train.drop(columns=['fraud_YN']), train['fraud_YN']
X_test, y_test = test.drop(columns=['fraud_YN']), test['fraud_YN']

In [4]:
# Sanity-check the dimensions of the four pieces (features and labels for
# both train and test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((12879, 50), (3121, 50), (12879,), (3121,))

In [9]:
# Class distribution of the training label. The recorded output shows 12845
# rows of class 0 against only 34 rows of class 1, which is the imbalance the
# resampling experiments below try to address.
y_train.value_counts()

0    12845
1       34
Name: fraud_YN, dtype: int64

# re-sampling
- Oversampling: Duplicating samples from the minority class
- Undersampling: Deleting samples from the majority class

### 1. Naive random over-sampling
: Generates new samples by randomly sampling, with replacement, from the currently available samples.

In [10]:
from collections import Counter

In [11]:
from imblearn.over_sampling import RandomOverSampler

# Re-draw existing rows with replacement until the classes are balanced; the
# fixed seed makes the draw repeatable.
# NOTE(review): the name `random` shadows the stdlib module of the same name
# if that module is imported anywhere in this notebook.
random = RandomOverSampler(random_state=4)
X_resampled, y_resampled = random.fit_resample(X_train, y_train)

# Report which sampler ran, then the per-class row counts it produced.
print(type(random).__name__, "\n")
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

RandomOverSampler 

[(0, 12845), (1, 12845)]


In [12]:
# Show the last five rows of the resampled feature matrix as plain text.
print(X_resampled.tail(5))

       x2  x4  x5   x7  x8  x9  x10          x14      x15  x17  ...  x16_0  \
25685   0   0   2  100   2   0    0       0.0000  1014810    0  ...      0   
25686   0   0   2  100   1   0    1  183486.5085  1110260    1  ...      1   
25687   0   1   4  100   1   0    0  323583.5555        0    0  ...      0   
25688   0   0   1  100   3   0    0  218102.5032        0    1  ...      0   
25689   0   0   1  100   1   0    0       0.0000        0    1  ...      0   

       x16_1  x16_2  x16_3  x16_4  x16_5  x20_0  x20_1  x20_2  x20_3  
25685      0      1      0      0      0      1      0      0      0  
25686      0      0      0      0      0      0      1      0      0  
25687      1      0      0      0      0      1      0      0      0  
25688      0      1      0      0      0      1      0      0      0  
25689      1      0      0      0      0      1      0      0      0  

[5 rows x 50 columns]


In [13]:
# Class distribution of the resampled label; the recorded output shows both
# classes at 12845 rows.
y_resampled.value_counts()

0    12845
1    12845
Name: fraud_YN, dtype: int64

### 2. SMOTE
: First, it finds the n nearest neighbors within the minority class for each minority sample. Then it draws a line between the sample and each of those neighbors and generates random points on the lines.

In [14]:
from imblearn.over_sampling import SMOTE

# SMOTE creates synthetic minority rows by interpolating between a minority
# sample and one of its nearest minority-class neighbours. The neighbour
# choice and the interpolation point are random, so the seed is fixed (same
# value as the other seeded samplers in this notebook) to make re-runs
# produce the same resampled data.
smote = SMOTE(random_state=4)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
print(sorted(Counter(y_resampled).items()))

[(0, 12845), (1, 12845)]


### 3. ADASYN
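: An improved version of SMOTE that adaptively generates more synthetic samples for the minority examples that are harder to learn (those surrounded by more majority-class neighbors).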

In [16]:
from imblearn.over_sampling import ADASYN

# ADASYN generates synthetic minority rows with a random component, so the
# seed is fixed (same value as the other seeded samplers in this notebook)
# to make re-runs produce the same resampled data.
adasyn = ADASYN(random_state=4)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Report which sampler ran, then the per-class row counts. The two classes
# may not end up with exactly the same number of rows.
print(adasyn.__class__.__name__, "\n")
print(sorted(Counter(y_resampled).items()))

ADASYN 

[(0, 12845), (1, 12852)]


In [17]:
# Show the last five rows of the resampled feature matrix as plain text.
print(X_resampled.tail(5))

       x2  x4  x5   x7  x8  x9  x10            x14  x15  x17  ...  x16_0  \
25692   0   0   1  100   2   0    0  282607.796957    0    1  ...      0   
25693   0   0   1  100   1   0    0  241404.852802    0    1  ...      0   
25694   0   0   1  100   1   0    0  192805.041716    0    1  ...      0   
25695   0   0   2   88   1   0    0  416711.685078    0    0  ...      0   
25696   0   0   2   54   1   0    0  381640.584525    0    0  ...      0   

       x16_1  x16_2  x16_3  x16_4  x16_5  x20_0  x20_1  x20_2  x20_3  
25692      0      0      0      0      0      0      0      0      0  
25693      0      0      0      0      0      0      0      0      0  
25694      0      0      0      0      0      0      0      0      0  
25695      0      0      0      0      0      0      1      0      0  
25696      0      0      0      0      0      0      1      0      0  

[5 rows x 50 columns]


In [18]:
# Class distribution of the resampled label; the recorded output shows 12852
# rows of class 1 against 12845 rows of class 0, i.e. close to but not
# exactly balanced.
y_resampled.value_counts()

1    12852
0    12845
Name: fraud_YN, dtype: int64

### 4. BorderlineSMOTE
: Unlike plain SMOTE, where synthetic data are created randomly between any two minority samples, Borderline-SMOTE only creates synthetic data along the decision boundary between the two classes.

In [19]:
from imblearn.over_sampling import BorderlineSMOTE

# Borderline-SMOTE synthesizes minority rows with a random component, so the
# seed is fixed (same value as the other seeded samplers in this notebook)
# to make re-runs produce the same resampled data.
border = BorderlineSMOTE(random_state=4)
X_resampled, y_resampled = border.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
print(sorted(Counter(y_resampled).items()))

[(0, 12845), (1, 12845)]


In [20]:
# Show the last ten rows of the resampled feature matrix as plain text.
print(X_resampled.tail(10))

       x2  x4  x5   x7  x8  x9  x10  x14     x15  x17  ...  x16_0  x16_1  \
25680   0   0   1  100   1   0    0  0.0  345373    0  ...      0      0   
25681   0   0   1  100   1   0    0  0.0  367116    0  ...      1      0   
25682   0   0   1  100   1   0    0  0.0   81616    0  ...      0      0   
25683   0   0   1  100   1   0    0  0.0   89651    0  ...      1      0   
25684   0   0   3  100   1   0    0  0.0  352526    0  ...      0      0   
25685   0   0   1  100   1   0    0  0.0  357126    0  ...      1      0   
25686   0   0   1  100   1   0    0  0.0  356940    0  ...      1      0   
25687   0   0   1  100   1   0    0  0.0   14450    0  ...      0      0   
25688   0   0   1  100   1   0    0  0.0   41612    0  ...      0      0   
25689   0   0   1  100   1   0    0  0.0  359341    0  ...      1      0   

       x16_2  x16_3  x16_4  x16_5  x20_0  x20_1  x20_2  x20_3  
25680      0      0      0      0      0      0      0      0  
25681      0      0      0      0  

In [21]:
# Class distribution of the resampled label; the recorded output shows both
# classes at 12845 rows.
y_resampled.value_counts()

0    12845
1    12845
Name: fraud_YN, dtype: int64

### 5. SMOTENC
: Synthetic Minority Over-sampling Technique for Nominal and Continuous.\
: Unlike SMOTE, SMOTE-NC is designed for datasets containing both numerical and categorical features. However, it is not designed to work with only categorical features.

In [13]:
# categorical_features = []
# for i,j in enumerate(list(X_train.columns)):
#     categorical_features.append(i)

In [14]:
# categorical_features.remove(7) # 14

In [15]:
# categorical_features.remove(8) # 15

In [23]:
# Positional indices of the columns SMOTENC should treat as categorical:
# every feature except `x14` and `x15` (positions 7 and 8 in the current
# column order). Deriving the positions from the column names keeps the list
# correct if columns are later added, dropped, or reordered.
categorical_features = [
    position
    for position, column in enumerate(X_train.columns)
    if column not in ('x14', 'x15')
]

In [24]:
from imblearn.over_sampling import SMOTENC

# Seeded SMOTE variant that is told which column positions hold categorical
# values (see `categorical_features`).
somtenc = SMOTENC(
    categorical_features=categorical_features,
    random_state=4,
)
X_resampled, y_resampled = somtenc.fit_resample(X_train, y_train)

# Report which sampler ran, then the per-class row counts it produced.
print(type(somtenc).__name__, "\n")
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

SMOTENC 

[(0, 12845), (1, 12845)]


In [25]:
# Show the last five rows of the resampled feature matrix as plain text.
print(X_resampled.tail(5))

       x2  x4  x5   x7  x8  x9  x10            x14     x15  x17  ...  x16_0  \
25685   0   0   2  100   1   0    0       0.000000  652050    0  ...      1   
25686   0   0   4  100   1   0    0  386894.414089       0    1  ...      0   
25687   0   0   1  100   1   0    0  169870.265312       0    1  ...      0   
25688   0   0   4  100   1   0    0       0.000000       0    0  ...      0   
25689   0   0   1  100   2   0    0       0.000000       0    1  ...      0   

       x16_1  x16_2  x16_3  x16_4  x16_5  x20_0  x20_1  x20_2  x20_3  
25685      0      0      0      0      0      0      1      0      0  
25686      1      0      0      0      0      0      1      0      0  
25687      0      0      0      0      0      1      0      0      0  
25688      0      0      0      0      0      1      0      0      0  
25689      0      0      0      0      0      1      0      0      0  

[5 rows x 50 columns]


In [26]:
# Class distribution of the resampled label; the recorded output shows both
# classes at 12845 rows.
y_resampled.value_counts()

0    12845
1    12845
Name: fraud_YN, dtype: int64

### 6. SVMSMOTE
: What is special about Borderline-SMOTE SVM (SVMSMOTE) compared to Borderline-SMOTE is that more data are synthesized away from the region of class overlap. It focuses more on where the data are separated.

In [27]:
from imblearn.over_sampling import SVMSMOTE

# Seeded SVM-based SMOTE variant. In the recorded run the minority class
# ends at 5990 rows against 12845 majority rows, so the result is not fully
# balanced.
svmsmote = SVMSMOTE(random_state=4)
X_resampled, y_resampled = svmsmote.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0, 12845), (1, 5990)]


### 7. SMOTEENN (combination(over-under))
: Over-sampling using SMOTE and cleaning using ENN.\
: Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.

In [28]:
from imblearn.combine import SMOTEENN

# Seeded combination sampler: SMOTE over-sampling followed by Edited Nearest
# Neighbours cleaning. In the recorded run both class sizes change
# (7701 rows of class 0, 8239 rows of class 1).
smoteenn = SMOTEENN(random_state=4)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Report which sampler ran, then the per-class row counts it produced.
print(type(smoteenn).__name__, "\n")
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

SMOTEENN 

[(0, 7701), (1, 8239)]


In [30]:
# Show the last ten rows of the resampled feature matrix as plain text.
print(X_resampled.tail(10))

       x2  x4  x5   x7  x8  x9  x10           x14      x15  x17  ...  x16_0  \
15930   0   0   3  100   1   0    0  3.357163e+06   895703    0  ...      0   
15931   0   0   2  100   1   0    0  4.434463e+05   277638    0  ...      0   
15932   0   0   1  100   1   0    0  0.000000e+00   367130    0  ...      1   
15933   0   0   1  100   2   0    0  5.328603e+05   178244    0  ...      0   
15934   0   0   2  100   1   0    0  0.000000e+00        0    0  ...      0   
15935   0   0   2  100   1   0    0  0.000000e+00  1014717    0  ...      0   
15936   0   0   2  100   1   0    0  4.399815e+05   308866    0  ...      0   
15937   0   0   3  100   1   0    0  0.000000e+00   352657    0  ...      0   
15938   0   0   2  100   1   0    0  0.000000e+00        0    1  ...      1   
15939   0   0   1  100   1   0    0  0.000000e+00        0    1  ...      0   

       x16_1  x16_2  x16_3  x16_4  x16_5  x20_0  x20_1  x20_2  x20_3  
15930      1      0      0      0      0      0      0     

In [31]:
# Class distribution of the resampled label; the recorded output shows 8239
# rows of class 1 against 7701 rows of class 0.
y_resampled.value_counts()

1    8239
0    7701
Name: fraud_YN, dtype: int64

### 8. SMOTETomek (combination(over-under))
: Over-sampling using SMOTE and cleaning using Tomek links.\
: Combine over- and under-sampling using SMOTE and Tomek links.

In [32]:
from imblearn.combine import SMOTETomek

# Seeded combination sampler: SMOTE over-sampling followed by Tomek-link
# cleaning. The recorded run ends with 11247 rows in each class.
smotetomek = SMOTETomek(random_state=4)
X_resampled, y_resampled = smotetomek.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0, 11247), (1, 11247)]


In [33]:
# Class distribution of the resampled label; the recorded output shows both
# classes at 11247 rows.
y_resampled.value_counts()

0    11247
1    11247
Name: fraud_YN, dtype: int64