# Library

In [None]:
# default setting
import os
import gc
import pickle
from tqdm import tqdm

# data tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

# model pred
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, LSTM
from keras.layers.merge import concatenate
import tensorflow as tf
from numba import cuda
from itertools import product

# model interpretor
import lime
import lime.lime_tabular

# Default setting, Data

In [None]:
# All relative paths used later resolve against this project folder.
os.chdir(r'E:\[03] 단기 작업\빅콘 테스트')

# Let TensorFlow grow GPU memory on demand instead of reserving it all up
# front.  The flag is applied to every visible GPU because TensorFlow requires
# the memory-growth setting to be the same across devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth has to be configured at program start, before the
        # GPUs have been initialized.
        print(e)

In [None]:
# Load the train and test CSV files into DataFrames.
full_data_train, full_data_test = (
    pd.read_csv(csv_name)
    for csv_name in ('full_data_train_2.csv', 'full_data_test_2.csv')
)

# 나이 / 현재 기준 경과한 입사일으로 변경 

Train 열 생성.
나이 / 현재 기준 경과한 입사일 / 분석 제거하는 4개 열

In [None]:
# Age: difference between the fixed reference year 2022 and the birth year.
age = 2022 - full_data_train.birth_year

# Tenure: whole days between company_enter_month (parsed as YYYYMM) and the
# moment this cell runs.  The vectorized `.dt.days` accessor replaces a
# per-row Python lambda over the whole column.
# NOTE(review): dt.datetime.now() makes this feature depend on the run date.
employdate = dt.datetime.now() - pd.to_datetime(full_data_train.company_enter_month, format = '%Y%m')
employdate = employdate.dt.days

# Attach the derived columns, then drop the two raw columns they replace
# together with the two insert-time columns.
full_data_train['employ_date'] = employdate
full_data_train['age'] = age
full_data_train = full_data_train.drop(['loanapply_insert_time', 'insert_time', 'company_enter_month', 'birth_year'], axis = 1)

# Pull identifiers and the target out of the frame so only features remain.
train_appid = full_data_train.pop('application_id')
train_usrid = full_data_train.pop('user_id')
train_target = full_data_train.pop('is_applied')

# The train identifiers are discarded right away.
del train_appid, train_usrid

Test 열 생성.
나이 / 현재 기준 경과한 입사일 / 분석 제거하는 4개 열

In [None]:
# Age: difference between the fixed reference year 2022 and the birth year.
age = 2022 - full_data_test.birth_year

# Tenure: whole days between company_enter_month (parsed as YYYYMM) and the
# moment this cell runs.  The vectorized `.dt.days` accessor replaces a
# per-row Python lambda over the whole column.
# NOTE(review): dt.datetime.now() makes this feature depend on the run date.
employdate = dt.datetime.now() - pd.to_datetime(full_data_test.company_enter_month, format = '%Y%m')
employdate = employdate.dt.days

# Attach the derived columns, then drop the two raw columns they replace
# together with the two insert-time columns.
full_data_test['employ_date'] = employdate
full_data_test['age'] = age
full_data_test = full_data_test.drop(['loanapply_insert_time', 'insert_time', 'company_enter_month', 'birth_year'], axis = 1)

# Pull identifiers and the target out of the frame so only features remain.
test_appid = full_data_test.pop('application_id')
test_usrid = full_data_test.pop('user_id')
test_target = full_data_test.pop('is_applied')

# The temporary series are no longer needed.
del employdate, age

# Label encoding

In [None]:
# Open question: should personal_rehabilitation_complete_yn and
# personal_rehabilitation_yn be part of the feature names?
feature_names = full_data_train.columns

# Columns handled as categorical; every other column is handled as numeric.
cat_feature_names = [
    'bank_id', 'product_id', 'gender', 'income_type', 'employment_type',
    'houseown_type', 'purpose', 'latest_os', 'latest_version', 'personal_rehabilitation',
]

# Walk the columns once and record, in frame order, the positions of the
# categorical columns plus the positions and names of the numeric ones.
categorical_features = []
numeric_features = []
num_features_names = []
for position, column in enumerate(full_data_train.columns):
    if column in cat_feature_names:
        categorical_features.append(position)
    else:
        numeric_features.append(position)
        num_features_names.append(column)

# Stack train and test so each encoder is fitted on the values of both splits.
full_data = pd.concat([full_data_train, full_data_test], axis = 0)

# One LabelEncoder per categorical column: fit on the stacked data, encode
# both splits in place, and remember the learned classes by column position.
categorical_names = {}
for feature in categorical_features:
    le = LabelEncoder().fit(full_data.iloc[:, feature])
    for frame in (full_data_train, full_data_test):
        frame.iloc[:, feature] = le.transform(frame.iloc[:, feature])
    categorical_names[feature] = le.classes_

del full_data, le

라벨 인코딩 결과확인.

In [None]:
# Preview the first 20 training rows after label encoding.
full_data_train.head(20)

Unnamed: 0,bank_id,product_id,loan_limit,loan_rate,gender,credit_score,yearly_income,income_type,employment_type,houseown_type,...,OpenApp,UsePrepayCalc,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply,SignUp,UseDSRCalc,employ_date,age
0,52,157,20000000.0,16.5,1,540.0,32000000.0,0,3,2,...,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3235,52.0
1,52,157,11000000.0,16.5,1,580.0,72000000.0,0,3,3,...,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,1166,45.0
2,10,77,3000000.0,20.0,1,580.0,72000000.0,0,3,3,...,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,1166,45.0
3,41,142,10000000.0,13.5,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
4,24,111,22000000.0,15.9,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
5,21,145,10000000.0,18.4,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
6,43,4,3000000.0,14.8,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
7,5,21,31000000.0,16.6,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
8,51,123,3000000.0,6.2,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0
9,40,146,39000000.0,12.4,1,740.0,39000000.0,0,3,3,...,4.0,0.0,3.0,4.0,2.0,1.0,0.0,0.0,3875,39.0


In [None]:
# Preview the first 20 test rows after label encoding.
full_data_test.head(20)

Unnamed: 0,bank_id,product_id,loan_limit,loan_rate,gender,credit_score,yearly_income,income_type,employment_type,houseown_type,...,OpenApp,UsePrepayCalc,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply,SignUp,UseDSRCalc,employ_date,age
0,6,127,42000000.0,13.6,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
1,24,111,24000000.0,17.9,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
2,1,3,24000000.0,18.5,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
3,3,184,29000000.0,10.8,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
4,10,77,5000000.0,16.4,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
5,34,110,21000000.0,15.2,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
6,43,4,3000000.0,14.8,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
7,27,143,10000000.0,18.0,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
8,46,118,31000000.0,13.0,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0
9,10,112,50000000.0,13.3,1,620.0,24000000.0,0,3,2,...,26.0,0.0,33.0,7.0,7.0,32.0,0.0,0.0,251,26.0


In [None]:
# Show the classes each LabelEncoder learned, keyed by column position.
categorical_names

{0: array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64], dtype=int64),
 1: array([  1,   4,   5,   7,   8,  12,  13,  15,  16,  19,  20,  21,  22,
         24,  25,  26,  29,  30,  31,  33,  35,  36,  38,  39,  40,  42,
         43,  46,  47,  48,  49,  51,  52,  53,  55,  56,  57,  58,  59,
         60,  61,  62,  63,  64,  65,  67,  68,  70,  71,  75,  76,  78,
         79,  80,  81,  82,  85,  86,  90,  91,  92,  93,  94,  96,  98,
        100, 101, 102, 103, 105, 107, 108, 110, 111, 113, 114, 116, 118,
        119, 121, 123, 124, 126, 127, 128, 129, 130, 131, 134, 136, 137,
        138, 139, 140, 141, 142, 144, 146, 147, 148, 149, 150, 152, 157,
        159, 161, 162, 163, 164, 166, 168, 169, 170, 171, 174, 175, 176,
        178, 181, 183, 184, 185, 186, 

# StandardScalering

In [None]:
# Standardize the numeric columns: the scaler is fitted on the training rows
# only, and the same fitted scaler is then applied to both splits.
std_enc = StandardScaler()
std_enc.fit(full_data_train.loc[:, num_features_names])
full_data_train.loc[:, num_features_names] = std_enc.transform(full_data_train.loc[:, num_features_names])
full_data_test.loc[:, num_features_names] = std_enc.transform(full_data_test.loc[:, num_features_names])

표준화 결과 확인.

In [None]:
# Preview the first 20 training rows after standardization.
full_data_train.head(20)

Unnamed: 0,bank_id,product_id,loan_limit,loan_rate,gender,credit_score,yearly_income,income_type,employment_type,houseown_type,...,OpenApp,UsePrepayCalc,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply,SignUp,UseDSRCalc,employ_date,age
0,52,157,-0.116924,0.960803,1,-1.617021,-0.120729,0,3,2,...,-0.244273,-0.05772,-0.395696,-0.437841,-0.392636,-0.395787,-0.116982,-0.067597,0.784325,1.325082
1,52,157,-0.520378,0.960803,1,-1.308054,0.182536,0,3,3,...,-0.244273,-0.05772,-0.266884,-0.266836,-0.159133,-0.296778,-0.116982,-0.067597,-0.263918,0.599754
2,10,77,-0.879003,1.861367,1,-1.308054,0.182536,0,3,3,...,-0.244273,-0.05772,-0.266884,-0.266836,-0.159133,-0.296778,-0.116982,-0.067597,-0.263918,0.599754
3,41,142,-0.565206,0.18889,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
4,24,111,-0.027268,0.80642,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
5,21,145,-0.565206,1.44968,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
6,43,4,-0.879003,0.523386,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
7,5,21,0.376185,0.986533,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
8,51,123,-0.879003,-1.68943,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955
9,40,146,0.734811,-0.094144,1,-0.072183,-0.067658,0,3,3,...,-0.07519,-0.05772,-0.138072,0.075176,-0.159133,-0.296778,-0.116982,-0.067597,1.108576,-0.021955


In [None]:
# Preview the first 20 test rows after standardization.
full_data_test.head(20)

Unnamed: 0,bank_id,product_id,loan_limit,loan_rate,gender,credit_score,yearly_income,income_type,employment_type,houseown_type,...,OpenApp,UsePrepayCalc,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply,SignUp,UseDSRCalc,employ_date,age
0,6,127,0.869295,0.214621,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
1,24,111,0.062388,1.321028,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
2,1,3,0.062388,1.475411,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
3,3,184,0.286529,-0.505831,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
4,10,77,-0.789347,0.935072,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
5,34,110,-0.072096,0.626307,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
6,43,4,-0.879003,0.523386,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
7,27,143,-0.565206,1.346759,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
8,46,118,0.376185,0.060238,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993
9,10,112,1.22792,0.137429,1,-0.999086,-0.181382,0,3,2,...,1.784715,-0.05772,3.726276,0.588193,1.008384,2.772475,-0.116982,-0.067597,-0.727495,-1.368993


# Onehot encoding 

In [None]:
# Rebuild the stacked train+test frame; the earlier one was created before
# label encoding and has been deleted.
full_data = pd.concat([full_data_train, full_data_test], axis = 0)

# Caveat: because of the one-hot step, the encoder always has to be fitted on
# the stacked data.  Categorical columns are one-hot encoded; all remaining
# columns are passed through unchanged.
onehot_step = ("enc", OneHotEncoder(), categorical_features)
encoder = ColumnTransformer([onehot_step], remainder = 'passthrough').fit(full_data)

del full_data

In [None]:
# Display the training target series.
train_target

0           1.0
1           0.0
2           0.0
3           0.0
4           0.0
           ... 
10264381    0.0
10264382    0.0
10264383    0.0
10264384    0.0
10264385    0.0
Name: is_applied, Length: 10264386, dtype: float64

y_target값은 undersampling 과정에서 onehot이 풀리기 때문에, 훈련만 시켜주도록 하겠다.

In [None]:
# One-hot encoder for the target.  It is only fitted here; the transform is
# applied later, to the undersampled targets.
# NOTE(review): newer scikit-learn releases renamed `sparse=` to
# `sparse_output=` — confirm against the installed version before upgrading.
y_encoder = OneHotEncoder(sparse = False)
y_encoder.fit(train_target.to_frame())
# Every test_target value is NA.

# Random undersampling

In [None]:
# Fallback idea: if this does not work, look into Tomek links (an approach
# that throws away samples on the decision boundary) and resolve it from there.

# Build five independently undersampled train/validation splits.
X_train, X_valid, y_train, y_valid = [], [], [], []
for i in tqdm(range(5)):
    # Random undersampling with the sampler's default settings.
    rus = RandomUnderSampler()
    undersampled_data, undersampled_target = rus.fit_resample(full_data_train, train_target)

    # Shuffled 80/20 split, stratified on the undersampled target.
    X_t, X_v, y_t, y_v = train_test_split(
        undersampled_data,
        undersampled_target,
        test_size=0.2,
        shuffle=True,
        stratify=undersampled_target,
    )

    # Encode features (densified) and targets, one list entry per round.
    X_train.append(encoder.transform(X_t).toarray())
    X_valid.append(encoder.transform(X_v).toarray())
    y_train.append(y_encoder.transform(y_t.to_frame()))
    y_valid.append(y_encoder.transform(y_v.to_frame()))

# The four lists now hold five splits each; drop the per-round temporaries.
del undersampled_data, undersampled_target, X_t, X_v, y_t, y_v

100%|██████████| 5/5 [00:46<00:00,  9.29s/it]


In [None]:
# Shapes of split 0: X_train, X_valid, y_train, y_valid.
X_train[0].shape, X_valid[0].shape, y_train[0].shape, y_valid[0].shape

((887537, 477), (221885, 477), (887537, 2), (221885, 2))

In [None]:
# Shapes of split 1: X_train, X_valid, y_train, y_valid.
X_train[1].shape, X_valid[1].shape, y_train[1].shape, y_valid[1].shape

((887537, 477), (221885, 477), (887537, 2), (221885, 2))

In [None]:
# Shapes of split 2: X_train, X_valid, y_train, y_valid.
X_train[2].shape, X_valid[2].shape, y_train[2].shape, y_valid[2].shape

((887537, 477), (221885, 477), (887537, 2), (221885, 2))

In [None]:
# Shapes of split 3: X_train, X_valid, y_train, y_valid.
X_train[3].shape, X_valid[3].shape, y_train[3].shape, y_valid[3].shape

((887537, 477), (221885, 477), (887537, 2), (221885, 2))

In [None]:
# Shapes of split 4: X_train, X_valid, y_train, y_valid.
X_train[4].shape, X_valid[4].shape, y_train[4].shape, y_valid[4].shape

((887537, 477), (221885, 477), (887537, 2), (221885, 2))

In [None]:
# Inspect the encoded training features of split 1.
X_train[1]

array([[ 1.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.43364282, -1.16175605],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
         1.30819307, -0.43642832],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.63427313,  0.59975415],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.69608348, -1.88708378],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.74168128,  1.63593663],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.55675687, -0.33281007]])

In [None]:
# Inspect the encoded training features of split 0.
X_train[0]

array([[ 0.        ,  0.        ,  0.        , ..., -0.06759738,
         0.38306441,  1.94679137],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
         3.34388144,  0.80699065],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.75738718, -0.95451956],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.60336796, -0.43642832],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
        -0.34143394, -1.2653743 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.06759738,
         1.32389898,  0.18528117]])

In [None]:
# Inspect the one-hot encoded training targets of split 0.
y_train[0]

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [None]:
# Inspect the one-hot encoded training targets of split 1.
y_train[1]

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

# 생성한 데이터 저장

model 구성.
onehot 실시 안하면 5개까지 돌아감 -> val_acc 78~79%정도.
tf.dataset 이용해서 데이터셋 구분해야할것 같은데

대용량 훈련 데이터 처리방안.
https://jins-sw.tistory.com/14

일단 램에 있는 작업 내역들 전부 끌어내리기 위해 저장을 실시한다.

In [None]:
def _dump_split(features, targets, csv_path):
    """Write features and targets side by side to csv_path, then run the GC."""
    pd.DataFrame(np.concatenate([features, targets], axis = 1)).to_csv(csv_path, index = False)
    gc.collect()


# Make sure both output folders exist.
for folder in ('./train', './valid'):
    if not os.path.exists(folder):
        os.mkdir(folder)

# One CSV per split and role: the feature columns come first, followed by the
# one-hot target columns.
for i in tqdm(range(5)):
    _dump_split(X_train[i], y_train[i], f'./train/train_{i}_samples.csv')
    _dump_split(X_valid[i], y_valid[i], f'./valid/valid_{i}_samples.csv')

# Release the in-memory copies now that everything is on disk.
del X_train, X_valid, y_train, y_valid

gc.collect()

100%|██████████| 5/5 [21:53<00:00, 262.74s/it]


0

# 저장한 데이터 불러오기

데이터를 저장해서 hdd에 넣은뒤, 불러오는 식으로 해결한다.

In [None]:
def get_data(path):
    """Stream ``(features, label)`` pairs from a CSV file, one row at a time.

    The first line is treated as a header and skipped.  On every remaining
    line the last two comma-separated fields form the label and all fields
    before them form the features; each field is parsed with ``float``.
    Blank lines are ignored.

    Args:
        path: Location of the CSV file (``str`` or ``bytes``).

    Yields:
        tuple[list[float], list[float]]: ``(features, label)`` for one row.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when the consumer stops
    # iterating early; the original left the handle open.
    with open(path, encoding='utf-8') as csv_file:
        next(csv_file, None)  # skip the header row (no-op for an empty file)
        for line in csv_file:
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line; float('') would raise otherwise
                continue
            tokens = stripped.split(',')
            features = [float(token) for token in tokens[:-2]]
            label = [float(token) for token in tokens[-2:]]

            yield (features, label)

이런 식으로 작동하는 함수이다. next가 호출될 때마다 파일에서 새로운 라인을 읽어 반환하는 generator 객체를 만든다.

generator로 변환된 훈련 데이터셋 (tensor 입력을 위해 변형)

# Tensor data set 생성


from_generator 설정: X_train은 전부 float이고, y_train은 onehot(0/1) 값이지만 코드에서는 X와 동일하게 tf.float64로 읽는다. X의 shape은 [None]으로 두어 들어온 길이 그대로 반환하고, y는 onehot 2열이므로 shape을 [2]로 지정한다.

In [None]:
# Stream the split-0 training CSV through get_data.  Each element is a
# (features, label) pair of float64 tensors: features of unspecified length,
# label of length 2.
train_data = tf.data.Dataset.from_generator(
    get_data,
    output_types=(tf.float64, tf.float64),
    output_shapes=(tf.TensorShape([None]), tf.TensorShape([2])),
    args=('./train/train_0_samples.csv',),
)

In [None]:
# Stream the split-0 validation CSV through get_data.  Each element is a
# (features, label) pair of float64 tensors: 477 features, label of length 2.
valid_data = tf.data.Dataset.from_generator(
    get_data,
    output_types=(tf.float64, tf.float64),
    output_shapes=(tf.TensorShape([477]), tf.TensorShape([2])),
    args=('./valid/valid_0_samples.csv',),
)
# Both None and 477 work for the feature shape: None returns whatever length
# comes in, and the rows have 477 features to begin with.

In [None]:
# Row counts of the train / validation split and the mini-batch size.
train_size = 887537
valid_size = 221885
batch_size = 256

# Group both streams into mini-batches and prefetch one batch ahead.
train_data, valid_data = (
    dataset.batch(batch_size).prefetch(1)
    for dataset in (train_data, valid_data)
)

In [None]:
# Show the batched and prefetched training dataset object.
train_data

<PrefetchDataset shapes: ((None, None), (None, 2)), types: (tf.float64, tf.float64)>

일단 데이터셋 input 텐서 5개를 어떻게 generator로 만들지 몰라서, 우선 데이터셋 1개(0번 샘플) 기준으로 구현했다.

# 모델 생성 후 학습

In [None]:
# Dense classifier: 477 inputs -> 512 -> 256 -> 256 (all swish) -> 2-way softmax.
input0 = Input(shape = (477, ), name = 'input1')  # number of X_train columns
layer0 = input0
for units in (512, 256, 256):
    layer0 = Dense(units, activation='swish')(layer0)
output0 = Dense(2, activation='softmax', name = 'output3')(layer0)

model_base = Model(inputs = [input0], outputs = [output0])
# Multi-input / multi-output variant, kept for reference:
#model = Model(inputs = [input1, input2, input3], outputs = [output1, output2, output3])
model_base.compile(
    optimizer='adam',
    loss = 'categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model.
model_base.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input1 (InputLayer)         [(None, 477)]             0         
                                                                 
 dense (Dense)               (None, 512)               244736    
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 output3 (Dense)             (None, 2)                 514       
                                                                 
Total params: 442,370
Trainable params: 442,370
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training callbacks.
# Early stopping with a patience of 20 epochs; the best weights are restored
# when training stops.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights = True,
    patience = 20,
)
# Scale the learning rate by 0.2 once val_accuracy has plateaued for 8 epochs,
# never going below 1e-9.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    patience=8,
    factor=0.2,
    min_lr = 1e-9,
)

In [None]:
# Train on the tf.data pipeline.  `batch_size` is deliberately not passed:
# both datasets are already batched, and Keras documents that the argument
# must not be specified when the input is a dataset.
model_base.fit(train_data, epochs=500, callbacks=[early_stopping, reduce_lr], validation_data=valid_data)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500


<keras.callbacks.History at 0x1fec4fc5e20>

모델 저장.

In [None]:
# Create the target folder first; nothing earlier in the notebook creates it
# and saving fails when it is missing.
os.makedirs('models', exist_ok=True)
model_base.save('models/'+'base_model-[swish + softmax]-512+256+256'+'.h5')

전체 데이터셋 대상으로 테스트 시행

In [None]:
# Encode the complete training set for evaluation.  The feature matrix is
# kept exactly as the encoder returns it (no `.toarray()`): densifying all
# rows at once is not needed, and the cells below read single rows through
# `.getrow(i)`, which only a sparse matrix provides.
X_f_train = encoder.transform(full_data_train)
y_f_train = y_encoder.transform(train_target.to_frame())

In [None]:
# A single row of a CSR matrix can be pulled out and densified like this.
# NOTE(review): this needs X_f_train to be a sparse matrix, not a dense
# ndarray — confirm the cell that builds it does not densify.
X_f_train.getrow(1).toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [None]:
# First five one-hot encoded targets of the full training set.
y_f_train[:5]

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [None]:
# Predict the first 10,000 rows with one batched call instead of issuing a
# separate `predict` call for every row.
eval_rows = X_f_train[:10000]
if hasattr(eval_rows, 'toarray'):
    # Sparse slice: densify only these rows before handing them to Keras.
    eval_rows = eval_rows.toarray()
y_pred = model_base.predict(eval_rows, batch_size=batch_size)
del eval_rows

100%|██████████| 10000/10000 [04:44<00:00, 35.18it/s]


In [None]:
# Show the model outputs: one row of two softmax values per sample.
y_pred

array([[0.07205309, 0.927947  ],
       [0.11755277, 0.88244724],
       [0.53988975, 0.46011022],
       ...,
       [0.35276636, 0.64723366],
       [0.9284599 , 0.07154015],
       [0.861413  , 0.13858701]], dtype=float32)

In [None]:
# One-hot targets of the first 10,000 training rows.
y_f_train[:10000]

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [None]:
# Predicted class index per row (argmax over the two outputs).
np.argmax(y_pred, axis = 1)

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

In [None]:
from sklearn.metrics import f1_score

# F1 score (scikit-learn defaults) on the first 10,000 training rows.  The
# one-hot targets and the model outputs are both reduced to class indices
# with argmax before scoring.
true_labels = np.argmax(y_f_train[:10000], axis = 1)
pred_labels = np.argmax(y_pred, axis = 1)
f1 = f1_score(true_labels, pred_labels)
print(f1)

0.29986091794158554


In [None]:
from sklearn.metrics import classification_report

# Text report for the first 10,000 training rows, computed on argmax class
# indices of the one-hot targets and of the model outputs.
report = classification_report(
    np.argmax(y_f_train[:10000], axis = 1),
    np.argmax(y_pred, axis = 1),
)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.74      0.85      9386
           1       0.18      0.88      0.30       614

    accuracy                           0.75     10000
   macro avg       0.59      0.81      0.57     10000
weighted avg       0.94      0.75      0.81     10000

