### 패키지 설치

In [1]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from xgboost.sklearn import XGBClassifier # from xgboost import XGBClassifier

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

In [2]:
pip freeze

absl-py==1.4.0
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.8.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.16.0
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.3
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.7
cuda-python==12.2.1
cudf-cu12 @ https://pypi.nvidia.c

In [3]:
pip install tensorflow-gpu==2.9.0



In [4]:
  !python --version

Python 3.10.12


### 데이터 불러오기

In [5]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

# 데이터 읽어오기 (고정 전처리한 데이터)
df = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", engine='python')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df

Unnamed: 0,연령대,성별,국산차량여부,직전3년간사고건수,차량경과년수,차종,운전자한정특별약관,가입경력코드,차량가입금액,영상기록장치특약가입,마일리지약정거리,사고유무
0,10.0,1.0,1.0,C,10년이하,중형,가족한정(형제자매제외),8.0,5천만원이하,미가입,15000K,1
1,10.0,1.0,1.0,D,5년이하,다목적2종,누구나(기본),8.0,5천만원이하,가입,미가입,1
2,10.0,1.0,1.0,D,10년이상,중형,가족한정(형제자매제외),8.0,미가입,미가입,미가입,0
3,10.0,1.0,1.0,N,5년이하,소형B,가족한정(형제자매제외),2.0,5천만원이하,가입,15000K,0
4,10.0,1.0,1.0,N,5년이하,소형B,가족한정(형제자매제외),3.0,5천만원이하,가입,15000K,0
...,...,...,...,...,...,...,...,...,...,...,...,...
187978,40.0,2.0,1.0,C,5년이하,소형B,부부한정,7.0,미가입,가입,15000K,0
187979,40.0,2.0,1.0,C,10년이상,다목적2종,누구나(기본),5.0,5천만원이하,미가입,15000K,0
187980,40.0,2.0,1.0,C,5년이하,대형,가족한정(형제자매제외),7.0,5천만원이하,가입,미가입,1
187981,40.0,2.0,1.0,C,10년이상,중형,누구나(기본),8.0,미가입,미가입,7000K,0


### 라벨링 코드

> **컬럼네임 해석**

age, NCR, carAge, carType, mileage,exp, money 순서대로

1: one-hot

2: handled label

ex) 111112 은 전부 one-hot이고, money만 handled label로 전처리


In [7]:
def make_df(df):

    ### 연령대 ###
    # one-hot
    one_hot_age = pd.get_dummies(df['연령대'], prefix='연령대', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_age = df['연령대'].apply(lambda x: int(str(x).replace(',', '').split('.')[0])).reset_index(drop=True)

    ages = [one_hot_age, labeled_age]


    ### 직전 3년간 사고 건수 ###
    # one-hot
    one_hot_NCR = pd.get_dummies(df['직전3년간사고건수'], prefix='NCR', drop_first = True).reset_index(drop=True)

    # handled-label
    # 신규 0, 무사고 1, 1회 2, 2회 3, 3회 4, 결측치 0 은 무사고로 편입
    labeled_NCR_tmp = df['직전3년간사고건수'].apply(lambda x: str(x).replace('0', 'N'))
    labeled_NCR = labeled_NCR_tmp.apply(lambda x: int(str(x).replace('Z', '0').replace('N', '1').replace('D', '2').replace('C', '3').replace('B', '4'))).reset_index(drop=True)

    NCRs = [one_hot_NCR, labeled_NCR]


    ### 차량 경과년수 ###
    # one-hot
    one_hot_carAge = pd.get_dummies(df['차량경과년수'], prefix='차량경과년수', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_carAge = df['차량경과년수'].apply(lambda x: int(str(x).replace('신차', '0').replace('5년이하', '1').replace('10년이하', '2').replace('10년이상', '3'))).reset_index(drop=True)

    carAges = [one_hot_carAge, labeled_carAge]


    ### 차종 ###
    # one-hot
    one_hot_carType = pd.get_dummies(df['차종'], prefix='차종', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_carType = df['차종'].apply(lambda x: int(str(x).replace('소형A', '0').replace('소형B', '1').replace('중형', '2').replace('대형', '3').replace('다목적1종','4').replace('다목적2종','5').replace('기타','6'))).reset_index(drop=True)

    carTypes = [one_hot_carType, labeled_carType]


    ### 마일리지 ###
    # one-hot
    one_hot_mileage = pd.get_dummies(df['마일리지약정거리'], prefix='마일리지', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_mileage_1 = df['마일리지약정거리'].apply(lambda x: str(x).replace('12000K','4').replace('15000K','5').replace('미가입','6'))
    labeled_mileage = labeled_mileage_1.apply(lambda x: int(str(x).replace('3000K', '0').replace('5000K', '1').replace('7000K', '2').replace('10000K', '3'))).reset_index(drop=True)

    mileages = [one_hot_mileage, labeled_mileage]


    ### 가입경력 ###
    # one-hot
    one_hot_exp = pd.get_dummies(df['가입경력코드'], drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_exp = df['가입경력코드'].apply(lambda x: int(str(x).split('.')[0])).reset_index(drop=True)

    exps = [one_hot_exp, labeled_exp]


    ### 가입금액 ###
    # one-hot
    one_hot_money = pd.get_dummies(df['차량가입금액'], drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_money = df['차량가입금액'].apply(lambda x : int(str(x).replace('미가입','0').replace('5천만원이하','1').replace('1억이하','2').replace('1억이상','3'))).reset_index(drop=True)

    moneys = [one_hot_money, labeled_money]


    ### 고정 전처리들 ###
    one_hot_sex = pd.get_dummies(df['성별'], prefix='성별', drop_first = True)
    one_hot_kor = pd.get_dummies(df['국산차량여부'], prefix='국산', drop_first = True)
    one_hot_rule = pd.get_dummies(df['운전자한정특별약관'], prefix='약관', drop_first = True) # 순서에 의미가 없어 dummy화
    one_hot_cctv = pd.get_dummies(df['영상기록장치특약가입'], prefix='영상기록', drop_first = True)

    combined_df = pd.concat([one_hot_sex, one_hot_kor, one_hot_rule, one_hot_cctv], axis=1).reset_index(drop=True)

    # y
    df_y = df['사고유무'].reset_index(drop=True)

    return ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y


### 모델 실행

In [8]:
# 로지스틱 회귀 함수
def xgb_classifier(X_train, X_test, y_train, y_test):

    # LogisticRegression 초기화
    model = XGBClassifier(random_state = 97) # tree_method= 'gpu_hist',

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1].reshape(-1,1) # 양성 클래스에 대한 확률 추출

    # 기본 평가지표
    auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred, average='weighted')

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn+fp)

    # 최적 threshold 값 & 그 때의 FPR 출력
    fper, tper, thresholds = roc_curve(y_test, y_proba)  # thresholds 별 fpr tpr 계산
    optimal_idx = np.argmax(tper - fper)                 # fpr, tpr 간 차이가 가장 클 때의 index 저장
    optimal_fpr = fper[optimal_idx]                      # 해당 index에 위치한 fpr값 저장
    optimal_threshold = thresholds[optimal_idx]

    # 최적 threshold 평가지표
    y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

    opt_accuracy = accuracy_score(y_test, y_optpred)
    opt_precision = precision_score(y_test, y_optpred)
    opt_recall = recall_score(y_test, y_optpred)
    opt_f1score = f1_score(y_test, y_optpred, average='weighted')

    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_optpred).ravel()
    opt_specificity = tn2 / (tn2 + fp2)

    return auc, accuracy, recall, precision, f1score, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1score, opt_specificity


In [21]:
data= []
auc = []
acc = []
prec = []
rec = []
f1 = []
spec = []

optacc = []
optprec = []
optrec = []
optspec = []
optf1 = []

for i in range(30):

    # 8천개를 sampling
    now_df = df.sample(n=8000)

    # 모든 라벨링 경우의 수 생성
    ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y = make_df(now_df)

    count = 0
    for a, age in enumerate(ages, start=1):
        for b, NCR in enumerate(NCRs, start=1):
            for c, carAge in enumerate(carAges, start=1):
                for d, carType in enumerate(carTypes, start=1):
                    for e, mileage in enumerate(mileages, start=1):
                        for f, exp in enumerate(exps, start=1):
                            for g, money in enumerate(moneys, start=1):

                                # 전처리 경우의 수 이름
                                name = str(a) + str(b) + str(c) + str(d) + str(e) + str(f) + str(g)

                                # 전처리 경우의 수를 합친 데이터프레임 생성
                                comb_df = pd.concat([combined_df, age, NCR, carAge, carType, mileage,exp, money], axis=1)

                                # smote 전 column명 string으로 변환
                                comb_df.columns = comb_df.columns.astype(str)

                                # smote 전 data type 변환
                                bool_mask = comb_df.dtypes == np.bool_
                                bool_cols = comb_df.columns[bool_mask].tolist()

                                for col_name in bool_cols:
                                    comb_df[col_name] = comb_df[col_name].astype(int)

                                # train test split
                                X_train, X_test, y_train, y_test = train_test_split(comb_df, df_y, test_size = 0.2)

                                # SMOTE를 이용하여 증강
                                smote = SMOTE(sampling_strategy='auto')
                                X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

                                # 평가지표
                                aucc, accuracy, recall, precision, f1score, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1score, opt_specificity = xgb_classifier(X_train_resampled, X_test, y_train_resampled, y_test)

                                data.append(name)
                                auc.append(aucc)
                                acc.append(accuracy)
                                prec.append(precision)
                                rec.append(recall)
                                f1.append(f1score)
                                spec.append(specificity)

                                optacc.append(opt_accuracy)
                                optprec.append(opt_precision)
                                optrec.append(opt_recall)
                                optf1.append(opt_f1score)
                                optspec.append(opt_specificity)

                                # count += 1
                                # if((count+1)%16 == 0) :
                                #     print(f"{count/128 * 100}% 완료")

    print("전체 ", i/30 * 100, "% 완료")

# 결과 DataFrame 생성
results = pd.DataFrame({
    'data' : data,
    'auc' : auc,
    'accuracy' : acc,
    'precision' : prec,
    'recall' : rec,
    "f1-score" : f1,
    "specificity" : spec,
    'opt_accuracy' : optacc,
    'opt_precision' : optprec,
    'opt_recall' : optrec,
    "opt_f1-score" : optf1,
    "opt_specificity" : optspec
})

# DataFrame 저장
results.sort_values(by=["data"]).to_csv("/content/drive/MyDrive/기계학습의이해/XGB/XGB최종/결과/XGB_전처리.csv", index = False)

전체  0.0 % 완료
전체  3.3333333333333335 % 완료
전체  6.666666666666667 % 완료
전체  10.0 % 완료
전체  13.333333333333334 % 완료
전체  16.666666666666664 % 완료
전체  20.0 % 완료
전체  23.333333333333332 % 완료
전체  26.666666666666668 % 완료
전체  30.0 % 완료
전체  33.33333333333333 % 완료
전체  36.666666666666664 % 완료
전체  40.0 % 완료
전체  43.333333333333336 % 완료
전체  46.666666666666664 % 완료
전체  50.0 % 완료
전체  53.333333333333336 % 완료
전체  56.666666666666664 % 완료
전체  60.0 % 완료
전체  63.33333333333333 % 완료
전체  66.66666666666666 % 완료
전체  70.0 % 완료
전체  73.33333333333333 % 완료
전체  76.66666666666667 % 완료
전체  80.0 % 완료
전체  83.33333333333334 % 완료
전체  86.66666666666667 % 완료
전체  90.0 % 완료
전체  93.33333333333333 % 완료
전체  96.66666666666667 % 완료


### 랭킹

In [22]:
df_result = results.groupby('data').mean().reset_index()

In [27]:
# accuracy 기준 결과 내림차순
df_result.sort_values(by=["auc"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
92,2122211,0.564185,0.801167,0.185064,0.138598,0.788676,0.904789,0.544729,0.167118,0.578091,0.605917,0.539708
30,1122221,0.563436,0.802583,0.186797,0.13655,0.789142,0.907051,0.515375,0.163946,0.615348,0.575538,0.499556
94,2122221,0.56239,0.792563,0.183131,0.152638,0.7843,0.893288,0.544813,0.168756,0.576251,0.60051,0.53968
76,2112211,0.560537,0.802938,0.186421,0.127855,0.787228,0.910539,0.563042,0.170889,0.548005,0.622217,0.56615
29,1122212,0.560135,0.802438,0.189506,0.131694,0.786983,0.909787,0.564583,0.173618,0.547042,0.621358,0.567273


In [24]:
# accuracy 기준 결과 내림차순
df_result.sort_values(by=["accuracy"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
32,1211111,0.549528,0.815687,0.176918,0.096302,0.791861,0.928974,0.527708,0.162479,0.576954,0.587494,0.519887
50,1221121,0.549556,0.815146,0.180871,0.093549,0.789341,0.931374,0.536167,0.166466,0.563546,0.591906,0.532447
18,1121121,0.554139,0.815,0.195645,0.107243,0.791279,0.928912,0.577333,0.172637,0.51492,0.63478,0.587089
34,1211121,0.542235,0.814979,0.171581,0.090286,0.789919,0.93006,0.452271,0.157747,0.668646,0.503219,0.417458
20,1121211,0.55884,0.814875,0.184455,0.110185,0.793992,0.924738,0.533167,0.16419,0.581652,0.5939,0.52508


In [25]:
# recall 기준 결과 내림차순
df_result.sort_values(by=["recall"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
127,2222222,0.545181,0.768521,0.166164,0.168046,0.76888,0.864986,0.543563,0.165586,0.551892,0.598586,0.542349
95,2122222,0.552417,0.781229,0.179275,0.162458,0.776793,0.880374,0.516833,0.164264,0.597656,0.576887,0.504367
79,2112222,0.556852,0.782104,0.182323,0.16113,0.776341,0.882876,0.558146,0.171514,0.552934,0.618372,0.558766
111,2212222,0.55007,0.772167,0.16429,0.161075,0.771185,0.869514,0.496625,0.16322,0.625638,0.551857,0.475656
94,2122221,0.56239,0.792563,0.183131,0.152638,0.7843,0.893288,0.544813,0.168756,0.576251,0.60051,0.53968


In [26]:
# precision 기준 결과 내림차순
df_result.sort_values(by=["precision"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
18,1121121,0.554139,0.815,0.195645,0.107243,0.791279,0.928912,0.577333,0.172637,0.51492,0.63478,0.587089
23,1121222,0.556009,0.808208,0.191835,0.122623,0.789747,0.91755,0.521479,0.166193,0.599017,0.582375,0.508346
0,1111111,0.55446,0.814208,0.191061,0.103914,0.790011,0.928975,0.530333,0.167639,0.58145,0.587042,0.522664
81,2121112,0.547943,0.811896,0.189848,0.110848,0.790008,0.924443,0.533896,0.164172,0.56278,0.590894,0.529989
29,1122212,0.560135,0.802438,0.189506,0.131694,0.786983,0.909787,0.564583,0.173618,0.547042,0.621358,0.567273




---



최고 전처리셋

1트 : 2122211

2트 : 2122211

3트 : 1122221 / 2111211 / 2112211

4트 : 2122211 /2122221 / 2112221

5트 : 2111212 /2112221 / 2122221

2122211


3회 나온 2122211 사용하기로..