### 라이브러리 호출

In [16]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from xgboost.sklearn import XGBClassifier # from xgboost import XGBClassifier

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

### 파일 불러오기

In [17]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

### dataframe 이름 넣기 ###
### 여기에서 데이터셋 수정 ###
data_name = "2122211"

# base preprocess 데이터
base = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", engine='python')

# 전처리 데이터셋
preprocess = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/DF/{data_name}.csv", encoding = 'euc-kr', engine='python')

# 파생변수 데이터셋
accident_rate = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/accident_rate.csv")
predicted = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/predicted_result.csv", encoding = 'UTF8', engine='python')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 모델 함수

In [18]:
# 로지스틱 회귀 함수
def xgb_classifier(X_train, X_test, y_train, y_test):

    # LogisticRegression 초기화
    model = XGBClassifier()

    # smote 전 data type 변환
    bool_mask = X_train.dtypes == np.bool_
    bool_cols = X_train.columns[bool_mask].tolist()
    bool_mask = X_test.dtypes == np.bool_
    bool_cols = X_test.columns[bool_mask].tolist()

    for col_name in bool_cols:
        X_train[col_name] = X_train[col_name].astype(int)
        X_test[col_name] = X_test[col_name].astype(int)

    # SMOTE oversampling
    smt = SMOTE(sampling_strategy = 'auto')
    X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

    # 모델 학습 / 예측
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1] # 양성 클래스에 대한 확률 추출

    # 기본 평가지표
    auc = roc_auc_score(y_test, y_proba)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn+fp)

    # 최적 threshold 값 & 그 때의 FPR 출력
    fper, tper, thresholds = roc_curve(y_test, y_proba)  # thresholds 별 fpr tpr 계산
    optimal_idx = np.argmax(tper - fper)                 # fpr, tpr 간 차이가 가장 클 때의 index 저장
    optimal_threshold = thresholds[optimal_idx]

    # 최적 threshold 평가지표
    y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

    opt_accuracy = accuracy_score(y_test, y_optpred)
    opt_precision = precision_score(y_test, y_optpred)
    opt_recall = recall_score(y_test, y_optpred)
    opt_f1 = f1_score(y_test, y_optpred, average='weighted')

    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_optpred).ravel()
    opt_specificity = tn2 / (tn2 + fp2)


    return auc, accuracy, recall, precision, f1, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1, opt_specificity

### numeric 추가

In [19]:
# X, y 설정
yn_y = base['사고유무']
yn_X = preprocess

In [20]:
yn_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187983 entries, 0 to 187982
Data columns (total 30 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   여성            187983 non-null  bool 
 1   외산            187983 non-null  bool 
 2   1인 및 지정1인     187983 non-null  bool 
 3   가족 및 지정1인     187983 non-null  bool 
 4   가족및형제자매한정     187983 non-null  bool 
 5   가족한정(형제자매제외)  187983 non-null  bool 
 6   기명피보험자1인한정    187983 non-null  bool 
 7   누구나(기본)       187983 non-null  bool 
 8   부부 및 지정1인     187983 non-null  bool 
 9   부부한정          187983 non-null  bool 
 10  임직원한정         187983 non-null  bool 
 11  미가입           187983 non-null  bool 
 12  연령대           187983 non-null  int64
 13  C             187983 non-null  bool 
 14  D             187983 non-null  bool 
 15  N             187983 non-null  bool 
 16  Z             187983 non-null  bool 
 17  차량경과년수        187983 non-null  int64
 18  차종            187983 non-null  int64
 19  마일

In [21]:
# 연령대 (a)
if (data_name[0] == '1'):
    yn_X['연령numeric'] = np.where(yn_X['30.0'] == True, 35,
                        np.where(yn_X['40.0'] == True, 44,
                        np.where(yn_X['50.0'] == True, 54,
                        np.where(yn_X['60.0'] == True, 64,
                        np.where(yn_X['20.0'] == True, 25,
                        np.where(yn_X['70.0'] == True, 73,
                        np.where(yn_X['80.0'] == True, 83,
                        np.where(yn_X['90.0'] == True, 93, 19)))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령numeric'] = np.where(yn_X['연령대'] == 30, 35,
                            np.where(yn_X['연령대'] == 40, 44,
                            np.where(yn_X['연령대'] == 50, 54,
                            np.where(yn_X['연령대'] == 60, 64,
                            np.where(yn_X['연령대'] == 20, 25,
                            np.where(yn_X['연령대'] == 70, 73,
                            np.where(yn_X['연령대'] == 80, 83,
                            np.where(yn_X['연령대'] == 90, 93, 19))))))))

# NCR (b)
if (data_name[1] == '1'):
    yn_X['직전3년numeric'] = np.where(yn_X['N'] == True, 0,
                        np.where(yn_X['D'] == True, 1,
                        np.where(yn_X['C'] == True, 2,
                        np.where(yn_X['Z'] == True, 0.15, 3))))
elif(data_name[1] == '2'):
    yn_X['직전3년numeric'] = np.where(yn_X['직전3년간사고건수'] == 'N', 0,
                            np.where(yn_X['직전3년간사고건수'] == 'D', 1,
                            np.where(yn_X['직전3년간사고건수'] == 'C', 2,
                            np.where(yn_X['직전3년간사고건수'] == 'Z', 0.15, 3))))

# 차량경과년수 (c)
if (data_name[2] == '1'):
    yn_X['차량경과numeric'] = np.where(yn_X['10년이상'] == True, 12.86,
                        np.where(yn_X['5년이하'] == True, 3.16,
                        np.where(yn_X['10년이하'] == True, 7.8, 0)))
elif(data_name[2] == '2'):
    yn_X['차량경과numeric'] = np.where(yn_X['차량경과년수'] == 3, 12.86,
                            np.where(yn_X['차량경과년수'] == 1, 3.16,
                            np.where(yn_X['차량경과년수'] == 2, 7.8, 0)))

# 차종 (d)
if (data_name[3] == '1'):
    yn_X['차종numeric'] = np.where(yn_X['소형A'] == True, 998,
                        np.where(yn_X['소형B'] == True, 1500,
                        np.where(yn_X['중형'] == True, 1999,
                        np.where(yn_X['대형'] == True, 2740,
                        np.where(yn_X['다목적2종'] == True, 2080, 2080)))))  # 다목적1종 지워짐
elif(data_name[3] == '2'):
    yn_X['차종numeric'] = np.where(yn_X['차종'] == 0, 998,
                            np.where(yn_X['차종'] == 1, 1500,
                            np.where(yn_X['차종'] == 2, 1999,
                            np.where(yn_X['차종'] == 3, 2740,
                            np.where(yn_X['차종'] == 4, 2080,
                            np.where(yn_X['차종'] == 5, 2080, np.nan))))))

# 마일리지약정거리 (e)
if (data_name[4] == '1'):
    yn_X['마일리지numeric'] = np.where(yn_X['3000K'] == True, 3,
                        np.where(yn_X['5000K'] == True, 5,
                        np.where(yn_X['7000K'] == True, 7,
                        np.where(yn_X['10000K'] == True, 10,
                        np.where(yn_X['12000K'] == True, 12,
                        np.where(yn_X['15000K'] == True, 15, 0))))))
elif(data_name[4] == '2'):
    yn_X['마일리지numeric'] = np.where(yn_X['마일리지약정거리'] == 0, 3,
                            np.where(yn_X['마일리지약정거리'] == 1, 5,
                            np.where(yn_X['마일리지약정거리'] == 2, 7,
                            np.where(yn_X['마일리지약정거리'] == 3, 10,
                            np.where(yn_X['마일리지약정거리'] == 4, 12,
                            np.where(yn_X['마일리지약정거리'] == 5, 15, 0))))))


In [22]:
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,7.0,8.0,1억이하,5천만원이하,미가입.1,연령numeric,직전3년numeric,차량경과numeric,차종numeric,마일리지numeric
0,True,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,19,2.0,7.80,1999.0,15
1,True,False,False,False,False,False,False,True,False,False,...,False,True,False,True,False,19,1.0,3.16,2080.0,0
2,True,False,False,False,False,True,False,False,False,False,...,False,True,False,False,True,19,1.0,12.86,1999.0,0
3,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,19,0.0,3.16,1500.0,15
4,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,19,0.0,3.16,1500.0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187978,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,True,44,2.0,3.16,1500.0,15
187979,False,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,44,2.0,12.86,2080.0,15
187980,False,False,False,False,False,True,False,False,False,False,...,True,False,False,True,False,44,2.0,3.16,2740.0,0
187981,False,False,False,False,False,False,False,True,False,False,...,False,True,False,False,True,44,2.0,12.86,1999.0,7


### numeric 모델 실행

In [None]:
data = []
auc = []
acc = []
prec = []
rec = []
f1 = []
spec = []

optacc = []
optprec = []
optrec = []
optspec = []
optf1 = []

for i in range(30):

    # y 범주 비율에 맞춰 train-test split
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # random sampling index 저장
    X_tr_rd = X_tr.sample(n=8000)
    y_tr_rd = y_tr[X_tr_rd.index]

    count = 0
    for a in range(1,3):
        for b in range(1,3):
            for c in range(1,3):
                for d in range(1,3):
                    for e in range(1,3):

                        # numeric 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
                        drop_list = []
                        name = str(a) + str(b) + str(c) + str(d) + str(e)

                        # drop할 column과 dataset 이름 설정
                        if (a==1):
                            drop_list.append('연령numeric')
                        if (b==1):
                            drop_list.append('직전3년numeric')
                        if (c==1):
                            drop_list.append('차량경과numeric')
                        if (d==1):
                            drop_list.append('차종numeric')
                        if (e==1):
                            drop_list.append('마일리지numeric')

                        # 데이터셋 X, y 설정
                        if(len(drop_list) != 0):
                            X_train = X_tr_rd.drop(columns = drop_list, axis = 1)
                            X_test = X_tst.drop(columns = drop_list, axis = 1)
                        else:
                            X_train = X_tr_rd
                            X_test = X_tst

                        y_train = y_tr_rd

                        # 모델 실행
                        aucc, accuracy, recall, precision, f1_sc, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1_sc, opt_specificity = xgb_classifier(X_train, X_test, y_train, y_test)
                        count += 1

                        # 결과 저장 & csv 저장
                        data.append(name)
                        auc.append(aucc)
                        acc.append(accuracy)
                        prec.append(precision)
                        rec.append(recall)
                        f1.append(f1_sc)
                        spec.append(specificity)

                        optacc.append(opt_accuracy)
                        optprec.append(opt_precision)
                        optrec.append(opt_recall)
                        optf1.append(opt_f1_sc)
                        optspec.append(opt_specificity)

    # 진행상황
    print("========== ", (i+1) / 30 * 100 , "% 완료 ===========")


# 결과 DataFrame 생성
results = pd.DataFrame({
    'numeric_name' : data,
    'auc' : auc,
    'accuracy' : acc,
    'precision' : prec,
    'recall' : rec,
    "f1-score" : f1,
    "specificity" : spec,
    'opt_accuracy' : optacc,
    'opt_precision' : optprec,
    'opt_recall' : optrec,
    "opt_f1-score" : optf1,
    "opt_specificity" : optspec
})

results = results.sort_values(by=["numeric_name"])

# csv 저장
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/XGB/XGB최종/결과/XGB_{data_name}_numeric(6_7).csv", index = False)



### numeric 랭킹

In [None]:
numeric_mean = results.groupby("numeric_name").mean().reset_index()
numeric_mean.sort_values(by='auc', ascending=False).head(10)

Unnamed: 0,numeric_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
21,21212,0.560954,0.815312,0.185012,0.096828,0.789825,0.931201,0.52483,0.161418,0.575342,0.594296,0.516683
5,11212,0.560823,0.815888,0.186862,0.097204,0.790232,0.93181,0.519299,0.160867,0.581903,0.589378,0.509201
17,21112,0.560278,0.804949,0.184991,0.118671,0.786392,0.915644,0.524242,0.160717,0.573586,0.594418,0.516283
23,21222,0.560093,0.82578,0.186794,0.075814,0.792946,0.946748,0.515358,0.160611,0.587163,0.585512,0.503776
22,21221,0.560042,0.817855,0.184312,0.090885,0.790511,0.935114,0.536066,0.162067,0.557845,0.60427,0.532553
4,11211,0.560013,0.806647,0.182829,0.113028,0.786711,0.918526,0.526339,0.161002,0.570375,0.59566,0.519236
1,11112,0.559978,0.804894,0.185825,0.119737,0.786489,0.915408,0.523169,0.160719,0.574346,0.5923,0.514915
20,21211,0.559766,0.807069,0.18437,0.113705,0.787061,0.918907,0.524015,0.160966,0.572833,0.592632,0.516141
19,21122,0.559563,0.815105,0.185107,0.097313,0.78977,0.930883,0.542184,0.161987,0.546974,0.609215,0.541412
7,11222,0.559556,0.825577,0.186529,0.076082,0.792872,0.946469,0.515544,0.160523,0.585599,0.584504,0.504244


### numeric 변수 결론

1트 : 21212

2트 : 21112

3트 : 11212 / 21212 / 21112

4트 : 11122 / 21112 / 21212

5트 : 21212 / 11212 / 21112

In [23]:
# ranking 결과 데이터셋 이름 가져오기
# numeric_name = ranking.iloc[0, 0]

### 직접 입력
numeric_name = str(21212)

# 데이터셋 생성
drop_list = []
name = ""

# drop할 column과 dataset 이름 설정
if (numeric_name[0] == '1'):
    drop_list.append('연령numeric')
if (numeric_name[1] == '1'):
    drop_list.append('직전3년numeric')
if (numeric_name[2] == '1'):
    drop_list.append('차량경과numeric')
if (numeric_name[3] == '1'):
    drop_list.append('차종numeric')
if (numeric_name[4] == '1'):
    drop_list.append('마일리지numeric')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5.0,6.0,7.0,8.0,1억이하,5천만원이하,미가입.1,연령numeric,차량경과numeric,마일리지numeric
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,True,False,19,7.8,15
1,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,True,False,19,3.16,0
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,19,12.86,0


### 전년도 사고율 추가

In [24]:
accident_rate

Unnamed: 0,10대,20대,30대,40대,50대,60대,70대,80대,90대,여성,남성,소형,중형,대형,미가입,가족,부부,기명피보험자1인,기타
0,1.21,0.55,0.47,0.48,0.64,0.75,0.79,0.79,0.79,0.155036,0.129267,1.09,1.35,1.5,0.12,0.16,0.11,0.12,0.14


In [25]:
# 전년도 사고율

# 연령대
if (data_name[0] == '1'):
    yn_X['연령대전년도사고율'] = np.where(yn_X['30.0'] == True, accident_rate['30대'],
                        np.where(yn_X['40.0'] == True, accident_rate['40대'],
                        np.where(yn_X['50.0'] == True, accident_rate['50대'],
                        np.where(yn_X['60.0'] == True, accident_rate['60대'],
                        np.where(yn_X['20.0'] == True, accident_rate['20대'],
                        np.where(yn_X['70.0'] == True, accident_rate['70대'],
                        np.where(yn_X['80.0'] == True, accident_rate['70대'],
                        np.where(yn_X['90.0'] == True, accident_rate['70대'], accident_rate['10대'])))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령대전년도사고율'] = np.where(yn_X['연령대'] == 30, accident_rate['30대'],
                                np.where(yn_X['연령대'] == 40, accident_rate['40대'],
                                np.where(yn_X['연령대'] == 50, accident_rate['50대'],
                                np.where(yn_X['연령대'] == 60, accident_rate['60대'],
                                np.where(yn_X['연령대'] == 20, accident_rate['20대'],
                                np.where(yn_X['연령대'] >= 70, accident_rate['70대'],
                                np.where(yn_X['연령대'] == 10, accident_rate['10대'],np.nan)))))))

# 성별 (원핫 고정)
yn_X['성별전년도사고율'] = np.where(yn_X['여성'] == True, accident_rate['여성'], accident_rate['남성']) # 남성이 지워짐

# 차종
if (data_name[3] == '1'):
    yn_X['차종전년도사고율'] = np.where(yn_X['중형'] == True, accident_rate['중형'],
                        np.where(yn_X['대형'] == True, accident_rate['대형'],
                        np.where(yn_X['소형B'] == True, accident_rate['소형'],
                        np.where(yn_X['다목적2종'] == True, accident_rate['대형'],
                        np.where(yn_X['소형A'] == True, accident_rate['소형'],
                        accident_rate['대형']))))) # 다목적1종이 지워짐
elif(data_name[3] == '2'):
    yn_X['차종전년도사고율'] = np.where(yn_X['차종'] == 2, accident_rate['중형'],
                            np.where(yn_X['차종'] == 3, accident_rate['대형'],
                            np.where(yn_X['차종'] == 1, accident_rate['소형'],
                            np.where(yn_X['차종'] == 5, accident_rate['대형'],
                            np.where(yn_X['차종'] == 0, accident_rate['소형'],
                            np.where(yn_X['차종'] == 4, accident_rate['대형'], np.nan))))))

# 특약 (원핫 고정)
yn_X['특약전년도사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), accident_rate['기명피보험자1인'],
                        np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), accident_rate['부부'],
                        np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), accident_rate['가족'],
                        np.where(yn_X['누구나(기본)'] == True, accident_rate['미가입'],
                        np.where(yn_X['임직원한정'] == True, accident_rate['기타'], accident_rate['부부']))))) # 부부 및 자녀한정이 지워짐


In [26]:
yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,1억이하,5천만원이하,미가입.1,연령numeric,차량경과numeric,마일리지numeric,연령대전년도사고율,성별전년도사고율,차종전년도사고율,특약전년도사고율
0,True,False,False,False,False,True,False,False,False,False,...,False,True,False,19,7.8,15,1.21,0.155036,1.35,0.16
1,True,False,False,False,False,False,False,True,False,False,...,False,True,False,19,3.16,0,1.21,0.155036,1.5,0.12
2,True,False,False,False,False,True,False,False,False,False,...,False,False,True,19,12.86,0,1.21,0.155036,1.35,0.16


### 전년도 사고율 모델 실행

In [27]:
data = []
auc = []
acc = []
prec = []
rec = []
f1 = []
spec = []

optacc = []
optprec = []
optrec = []
optspec = []
optf1 = []


for i in range(30):

    # y 범주 비율에 맞춰 train-test split
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # random sampling index 저장
    X_tr_rd = X_tr.sample(n=8000)
    y_tr_rd = y_tr[X_tr_rd.index]

    count = 0
    for a in range(1,3):
        for b in range(1,3):
            for c in range(1,3):
                for d in range(1,3):

                    # numeric 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
                    drop_list = []
                    name = str(a) + str(b) + str(c) + str(d)

                    # drop할 column과 dataset 이름 설정
                    if (a==1):
                        drop_list.append('연령대전년도사고율')
                    if (b==1):
                        drop_list.append('성별전년도사고율')
                    if (c==1):
                        drop_list.append('차종전년도사고율')
                    if (d==1):
                        drop_list.append('특약전년도사고율')

                    # 데이터셋 X, y 설정
                    if(len(drop_list) != 0):
                        X_train = X_tr_rd.drop(columns = drop_list, axis = 1)
                        X_test = X_tst.drop(columns = drop_list, axis = 1)
                    else:
                        X_train = X_tr_rd
                        X_test = X_tst

                    y_train = y_tr_rd

                    # 모델 실행
                    aucc, accuracy, recall, precision, f1_sc, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1_sc, opt_specificity = xgb_classifier(X_train, X_test, y_train, y_test)
                    count += 1

                    # 결과 저장 & csv 저장
                    data.append(name)
                    auc.append(aucc)
                    acc.append(accuracy)
                    prec.append(precision)
                    rec.append(recall)
                    f1.append(f1_sc)
                    spec.append(specificity)

                    optacc.append(opt_accuracy)
                    optprec.append(opt_precision)
                    optrec.append(opt_recall)
                    optf1.append(opt_f1_sc)
                    optspec.append(opt_specificity)

                    # # 진행상황 출력
                    # if (count % 4 == 0) :
                    #     print(f"{(count/16) * 100}% 완료")

    # 진행상황
    print("========== ", (i+1) / 30 * 100, "% 완료 ===========")


# 결과 DataFrame 생성
results = pd.DataFrame({
    'plus_name' : data,
    'auc' : auc,
    'accuracy' : acc,
    'precision' : prec,
    'recall' : rec,
    "f1-score" : f1,
    "specificity" : spec,
    'opt_accuracy' : optacc,
    'opt_precision' : optprec,
    'opt_recall' : optrec,
    "opt_f1-score" : optf1,
    "opt_specificity" : optspec
})

results = results.sort_values(by=["plus_name"])

# csv 저장
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/XGB/XGB최종/결과/XGB_{data_name}_plus(6_7).csv", index = False)



### 전년도 사고율 랭킹

In [28]:
plus_mean = results.groupby("plus_name").mean().reset_index()
plus_mean.sort_values(by='auc', ascending=False).head(10)

Unnamed: 0,plus_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
1,1112,0.559206,0.82898,0.19138,0.071716,0.79417,0.951125,0.525483,0.160567,0.569054,0.594802,0.518456
7,1222,0.559076,0.838812,0.189207,0.048889,0.795834,0.966225,0.518637,0.160277,0.579395,0.588146,0.508837
9,2112,0.558988,0.82908,0.188626,0.069865,0.79392,0.95154,0.524318,0.160348,0.570273,0.593847,0.516906
13,2212,0.558632,0.834224,0.187202,0.057896,0.794857,0.959444,0.528813,0.160798,0.565013,0.597893,0.522973
5,1212,0.558591,0.834513,0.187924,0.057679,0.794984,0.959815,0.521641,0.160367,0.575284,0.591797,0.512988
4,1211,0.558318,0.823629,0.183534,0.078208,0.792036,0.943863,0.543916,0.161794,0.54393,0.611607,0.543914
11,2122,0.558273,0.834873,0.187829,0.056875,0.795048,0.960362,0.51978,0.160255,0.576484,0.588912,0.510634
3,1122,0.557874,0.8348,0.186124,0.056249,0.794891,0.960378,0.526641,0.160409,0.566156,0.595418,0.520268
15,2222,0.557874,0.838362,0.184553,0.047894,0.795389,0.965863,0.532585,0.160801,0.559051,0.602087,0.528316
0,1111,0.557839,0.816012,0.183337,0.094,0.789836,0.932471,0.525569,0.160277,0.567311,0.594528,0.518837


### 전년도 사고율 변수 결론

1트 : 2112

2트 : 1112

3트 : 1112

4트 : 2212

5트 : 1112



In [29]:
# ranking 결과 데이터셋 이름 가져오기
# plus_name = ranking.iloc[0, 0]

### 직접입력
plus_name = str(1112)

# 데이터셋 생성
drop_list = []

# drop할 column과 dataset 이름 설정
if (plus_name[0] == '1'):
    drop_list.append('연령대전년도사고율')
if (plus_name[1] == '1'):
    drop_list.append('성별전년도사고율')
if (plus_name[2] == '1'):
    drop_list.append('차종전년도사고율')
if (plus_name[3] == '1'):
    drop_list.append('특약전년도사고율')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,6.0,7.0,8.0,1억이하,5천만원이하,미가입.1,연령numeric,차량경과numeric,마일리지numeric,특약전년도사고율
0,True,False,False,False,False,True,False,False,False,False,...,False,False,True,False,True,False,19,7.8,15,0.16
1,True,False,False,False,False,False,False,True,False,False,...,False,False,True,False,True,False,19,3.16,0,0.12
2,True,False,False,False,False,True,False,False,False,False,...,False,False,True,False,False,True,19,12.86,0,0.16


### 예측 사고율 추가

In [30]:
predicted

Unnamed: 0,20대,30대,40대,50대,~64,65~,남성,여성,특약부부,특약 미가입,특약 기타,특약 기명피보험자1인,특약 가족,남성 TAAS,여성 TAAS
0,0.00551,0.003748,0.033873,0.004499,0.022487,0.010354,0.04021,0.025433,0.043751,0.055043,0.06511,0.045383,0.087615,0.008801,0.003425


In [31]:
# 예측 사고율
# 연령대
if (data_name[0] == '1'):
    yn_X['연령대예측사고율'] = np.where(yn_X['30.0'] == True, predicted['30대'],
                        np.where(yn_X['40.0'] == True, predicted['40대'],
                        np.where(yn_X['50.0'] == True, predicted['50대'],
                        np.where(yn_X['60.0'] == True, predicted['~64'],
                        np.where(yn_X['20.0'] == True, predicted['20대'],
                        np.where(yn_X['70.0'] == True, predicted['65~'],
                        np.where(yn_X['80.0'] == True, predicted['65~'],
                        np.where(yn_X['90.0'] == True, predicted['65~'], predicted['20대'])))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령대예측사고율'] = np.where(yn_X['연령대'] == 30, predicted['30대'],
                            np.where(yn_X['연령대'] == 40, predicted['40대'],
                            np.where(yn_X['연령대'] == 50, predicted['50대'],
                            np.where(yn_X['연령대'] == 60, predicted['~64'],
                            np.where(yn_X['연령대'] <= 20, predicted['20대'],
                            np.where(yn_X['연령대'] >= 70, predicted['65~'], np.nan))))))

# 성별 taas
yn_X['TAAS성별예측사고율'] = np.where(yn_X['여성'] == False, predicted['남성 TAAS'], predicted['여성 TAAS'])

# 성별
yn_X['성별예측사고율'] = np.where(yn_X['여성'] == False, predicted['남성'], predicted['여성'])

# 특약
yn_X['특약예측사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), predicted['특약 기명피보험자1인'],
                            np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), predicted['특약부부'],
                            np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), predicted['특약 가족'],
                            np.where(yn_X['누구나(기본)'] == True, predicted['특약 미가입'],
                            np.where(yn_X['임직원한정'] == True, predicted['특약 기타'], predicted['특약부부']))))) # 부부및자녀한정이지워짐

In [32]:
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5천만원이하,미가입.1,연령numeric,차량경과numeric,마일리지numeric,특약전년도사고율,연령대예측사고율,TAAS성별예측사고율,성별예측사고율,특약예측사고율
0,True,False,False,False,False,True,False,False,False,False,...,True,False,19,7.80,15,0.16,0.005510,0.003425,0.025433,0.087615
1,True,False,False,False,False,False,False,True,False,False,...,True,False,19,3.16,0,0.12,0.005510,0.003425,0.025433,0.055043
2,True,False,False,False,False,True,False,False,False,False,...,False,True,19,12.86,0,0.16,0.005510,0.003425,0.025433,0.087615
3,True,False,False,False,False,True,False,False,False,False,...,True,False,19,3.16,15,0.16,0.005510,0.003425,0.025433,0.087615
4,True,False,False,False,False,True,False,False,False,False,...,True,False,19,3.16,15,0.16,0.005510,0.003425,0.025433,0.087615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187978,False,False,False,False,False,False,False,False,False,True,...,False,True,44,3.16,15,0.11,0.033873,0.008801,0.040210,0.043751
187979,False,False,False,False,False,False,False,True,False,False,...,True,False,44,12.86,15,0.12,0.033873,0.008801,0.040210,0.055043
187980,False,False,False,False,False,True,False,False,False,False,...,True,False,44,3.16,0,0.16,0.033873,0.008801,0.040210,0.087615
187981,False,False,False,False,False,False,False,True,False,False,...,False,True,44,12.86,7,0.12,0.033873,0.008801,0.040210,0.055043


### 예측 사고율 모델 실행

In [48]:
data = []
auc = []
acc = []
prec = []
rec = []
f1 = []
spec = []

optacc = []
optprec = []
optrec = []
optspec = []
optf1 = []

for i in range(30):

    # y 범주 비율에 맞춰 train-test split
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # random sampling index 저장
    X_tr_rd = X_tr.sample(n=8000)
    y_tr_rd = y_tr[X_tr_rd.index]

    count = 0
    for a in range(1,3):
        for b in range(1,3):
            for c in range(1,3):
                for d in range(1,3):

                    # numeric 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
                    drop_list = []
                    name = str(a) + str(b) + str(c) + str(d)

                    # drop할 column과 dataset 이름 설정
                    if (a==1):
                        drop_list.append('연령대예측사고율')
                    if (b==1):
                        drop_list.append('성별예측사고율')
                    if (c==1):
                        drop_list.append('TAAS성별예측사고율')
                    if (d==1):
                        drop_list.append('특약예측사고율')

                    # 데이터셋 X, y 설정
                    if(len(drop_list) != 0):
                        X_train = X_tr_rd.drop(columns = drop_list, axis = 1)
                        X_test = X_tst.drop(columns = drop_list, axis = 1)
                    else:
                        X_train = X_tr_rd
                        X_test = X_tst

                    y_train = y_tr_rd

                    # 모델 실행
                    aucc, accuracy, recall, precision, f1_sc, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1_sc, opt_specificity = xgb_classifier(X_train, X_test, y_train, y_test)
                    count += 1

                    # 결과 저장 & csv 저장
                    data.append(name)
                    auc.append(aucc)

                    acc.append(accuracy)
                    prec.append(precision)
                    rec.append(recall)
                    f1.append(f1_sc)
                    spec.append(specificity)

                    optacc.append(opt_accuracy)
                    optprec.append(opt_precision)
                    optrec.append(opt_recall)
                    optf1.append(opt_f1_sc)
                    optspec.append(opt_specificity)

                    # # 진행상황 출력
                    # if (count % 4 == 0) :
                    #     print(f"{(count/16) * 100}% 완료")

    # 진행상황
    print("========== ", (i+1) / 30 * 100, "% 완료 ===========")

# 결과 DataFrame 생성
results = pd.DataFrame({
    'lstm_name' : data,
    'auc' : auc,
    'accuracy' : acc,
    'precision' : prec,
    'recall' : rec,
    "f1-score" : f1,
    "specificity" : spec,
    'opt_accuracy' : optacc,
    'opt_precision' : optprec,
    'opt_recall' : optrec,
    "opt_f1-score" : optf1,
    "opt_specificity" : optspec
})

results = results.sort_values(by=["lstm_name"])

# csv 저장
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/XGB/XGB최종/결과/XGB_{data_name}_lstm(6_7).csv", index = False)



### 예측 사고율 랭킹

In [49]:
lstm_mean = results.groupby("lstm_name").mean().reset_index()
lstm_mean.sort_values(by='auc', ascending=False).head(10)

Unnamed: 0,lstm_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
4,1211,0.562175,0.834585,0.19559,0.061311,0.795687,0.959312,0.526971,0.161682,0.572597,0.596225,0.519612
1,1112,0.56213,0.829183,0.192847,0.072131,0.794369,0.951294,0.541752,0.162917,0.552975,0.609686,0.539941
0,1111,0.561909,0.829493,0.193166,0.071658,0.794472,0.951729,0.536006,0.162531,0.559256,0.603011,0.532256
2,1121,0.561908,0.834992,0.19585,0.060686,0.795811,0.959886,0.525912,0.161652,0.573797,0.594761,0.518188
6,1221,0.56166,0.834842,0.192706,0.059332,0.795478,0.95993,0.53728,0.162402,0.55699,0.604611,0.5341
7,1222,0.561602,0.834628,0.193266,0.060092,0.795496,0.959558,0.539184,0.162588,0.555413,0.606512,0.536566
15,2222,0.561586,0.83418,0.194816,0.061911,0.795556,0.958745,0.536958,0.162488,0.55842,0.604538,0.533496
11,2122,0.56151,0.833983,0.192982,0.061369,0.795347,0.958604,0.535629,0.162344,0.561158,0.604523,0.531511
5,1212,0.561438,0.834711,0.192319,0.059517,0.795435,0.959748,0.530611,0.161523,0.566092,0.599824,0.524889
3,1122,0.561234,0.834911,0.194864,0.060334,0.795702,0.959849,0.524334,0.161307,0.575259,0.593858,0.516119


### 예측 사고율 변수 결론

1트 : 1211 /1111 /1222

2트 : 1221 / 1112 / 1111

3트 : 1122 /1221 / 1121

4트 : 1112 / 1211 / 1222

5트 : 1222 / 1112 / 1111

1111 / 1212 / 1112

2211 1221 1222

In [50]:
# ranking 결과 데이터셋 이름 가져오기
# lstm_name = ranking.iloc[0, 0]

### 직접 입력 - 너무비슷함
lstm_name = str(1112)

# 데이터셋 생성
drop_list = []

# drop할 column과 dataset 이름 설정
if (lstm_name[0] == '1'):
    drop_list.append('연령대예측사고율')
if (lstm_name[1] == '1'):
    drop_list.append('성별예측사고율')
if (lstm_name[2] == '1'):
    drop_list.append('TAAS성별예측사고율')
if (lstm_name[3] == '1'):
    drop_list.append('특약예측사고율')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,7.0,8.0,1억이하,5천만원이하,미가입.1,연령numeric,차량경과numeric,마일리지numeric,특약전년도사고율,특약예측사고율
0,True,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,19,7.8,15,0.16,0.087615
1,True,False,False,False,False,False,False,True,False,False,...,False,True,False,True,False,19,3.16,0,0.12,0.055043
2,True,False,False,False,False,True,False,False,False,False,...,False,True,False,False,True,19,12.86,0,0.16,0.087615


In [51]:
# 파생변수 최종 데이터셋 저장

yn_X['사고유무'] = base['사고유무']
yn_X.to_csv(f"/content/drive/MyDrive/기계학습의이해/XGB/XGB최종/결과/XGB_{data_name}_파생변수.csv", index = False)