In [1]:
!pip install pyod -q
!pip install suod -q
!pip install --upgrade --force-reinstall xlrd -q

In [2]:
import pandas as pd
import numpy as np

from pyod.models.suod import SUOD
from pyod.models.ecod import ECOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.hbos import HBOS
from pyod.models.deep_svdd import DeepSVDD

from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import gc

In [3]:
# Read the raw spreadsheet; its first column is used as the row index.
raw_xls_path = './drive/MyDrive/save/taiwan/default of credit card clients.xls'
df = pd.read_excel(raw_xls_path, index_col=0)

In [4]:
# Give the label column a short name ('target'); the frame is modified in place.
df.rename(columns={'default payment next month': 'target'}, inplace=True)

In [5]:
# Detach the label column from the feature frame: `y` receives the column and
# `df` no longer contains it afterwards (in-place removal).
y = df.pop('target')

In [6]:
# Seed shared by the stochastic base detectors so that the set of rows flagged
# as outliers does not change arbitrarily from one run to the next.
OUTLIER_SEED = 42

# Base detectors that SUOD trains and combines. IForest and DeepSVDD accept a
# random_state and are seeded; ECOD, LOF and HBOS are built with their defaults.
# NOTE(review): recent pyod releases appear to require DeepSVDD(n_features=...);
# confirm against the installed version.
# NOTE(review): SUOD's own acceleration steps (random projection / model
# approximation) may add randomness that these seeds do not control — TODO confirm.
detector_list = [
    ECOD(),
    LOF(),
    IForest(random_state=OUTLIER_SEED),
    HBOS(),
    DeepSVDD(random_state=OUTLIER_SEED),
]

# Decide the number of parallel processes and the combination method;
# clf can then be used like any other outlier detection model.
#   n_jobs=-1                  -> let the backend pick the number of workers
#   combination='maximization' -> combine the detectors' scores by their maximum
#   contamination=0.2          -> expected share of outliers in the data
clf = SUOD(
    base_estimators=detector_list,
    n_jobs=-1,
    combination='maximization',
    contamination=0.2,
    verbose=False,
)

In [7]:
# Fit the ensemble on the feature frame and keep only the rows it labels 0.
outlier_labels = clf.fit_predict(df)
df = df[outlier_labels == 0]




[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.8min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   35.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   35.8s finished


In [8]:
# (rows, columns) of the feature frame after keeping only the rows labelled 0 above.
df.shape

(24002, 23)

In [9]:
# Re-attach the labels by index so that only the labels of the surviving rows
# remain next to their features (left join on the frame's index).
df = df.join(other=y, how='left')

In [10]:
# Detach the (now row-aligned) label column again: `y` receives it and `df`
# is left holding features only (in-place removal).
y = df.pop('target')

In [11]:
# Columns that are one-hot encoded in the next step.
cat_cols = [
    'SEX',
    'EDUCATION',
    'MARRIAGE',
    'PAY_0',
    'PAY_2',
    'PAY_3',
    'PAY_4',
    'PAY_5',
    'PAY_6',
]

In [12]:
# One-hot encoding: every column listed in cat_cols is replaced by one
# indicator column per distinct value; all other columns are kept as they are.
df = pd.get_dummies(data=df, columns=cat_cols)

In [13]:
# (rows, columns) of the feature frame after one-hot encoding the cat_cols columns.
df.shape

(24002, 68)

In [14]:
#Normalize
scaler = StandardScaler()
X = scaler.fit_transform(df)

In [15]:
#Oversampling and undersampling
sm = SMOTEENN(random_state=42, n_jobs = -1)
X, y = sm.fit_resample(X, y)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True, stratify = y)
print('Size of train dataframe: ', X_train.shape)
print('Size of train dataframe: ', y_train.shape)
print('Size of test dataframe: ', X_test.shape)
print('Size of train dataframe: ', y_test.shape)

Size of train dataframe:  (20259, 68)
Size of train dataframe:  (20259,)
Size of test dataframe:  (5065, 68)
Size of train dataframe:  (5065,)


In [17]:
# Wrap the splits back into labelled frames; the feature frames reuse the
# column names of the encoded frame `df`.
X_train = pd.DataFrame(X_train, columns=df.columns)
X_test = pd.DataFrame(X_test, columns=df.columns)

# Build the target frames from the raw label values rather than from the
# objects themselves: the labels may still carry the shuffled index produced by
# the split, while the feature frames above get a fresh 0..n-1 index. Using the
# values gives all four frames the same positional index, so index-aligned
# operations (join/concat) pair each row with its own label.
# ravel() keeps this cell re-runnable if y_* is already a one-column frame.
y_train = pd.DataFrame({'target': np.asarray(y_train).ravel()})
y_test = pd.DataFrame({'target': np.asarray(y_test).ravel()})

In [18]:
# Run a full garbage-collection pass before writing the results to disk; the
# number displayed below is gc.collect()'s return value.
gc.collect()

5700

In [19]:
# Persist the four splits as pickles; each destination path is mapped to the
# frame stored there, and the files are written in the order listed.
pickle_outputs = {
    './drive/MyDrive/save/taiwan/X_train.pkl': X_train,
    './drive/MyDrive/save/taiwan/X_test.pkl': X_test,
    './drive/MyDrive/save/taiwan/y_train.pkl': y_train,
    './drive/MyDrive/save/taiwan/y_test.pkl': y_test,
}
for out_path, frame in pickle_outputs.items():
    frame.to_pickle(out_path)