### 라이브러리 호출

In [36]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from sklearn.linear_model import LogisticRegression, LinearRegression

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메시지 무시
import warnings
warnings.filterwarnings('ignore')

### 파일 불러오기

In [37]:
# Mount Google Drive so the dataset files below can be read from Colab.
from google.colab import drive

drive.mount('/content/drive')

# Name of the preprocessed dataset to load -- edit this to switch datasets.
# Later cells also index into this string to decide which columns to read.
data_name = "2111211"

# Baseline preprocessed data.
base = pd.read_csv(
    "/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv",
    engine='python',
)

# Preprocessed feature dataset selected by ``data_name`` (EUC-KR encoded file).
preprocess = pd.read_csv(
    f"/content/drive/MyDrive/기계학습의이해/Dataset/DF/{data_name}.csv",
    encoding='euc-kr',
    engine='python',
)

# Lookup tables used to build the derived variables.
accident_rate = pd.read_csv(
    "/content/drive/MyDrive/기계학습의이해/Dataset/accident_rate.csv"
)
predicted = pd.read_csv(
    "/content/drive/MyDrive/기계학습의이해/Dataset/predicted_result.csv",
    encoding='UTF8',
    engine='python',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 모델 함수

In [38]:
# Logistic-regression training / evaluation helper
def logistic_classifier(X_train, X_test, y_train, y_test, random_state=None):
    """Train a logistic regression on SMOTE-balanced data and score it.

    Boolean feature columns are cast to int, the training set is oversampled
    with SMOTE, and a default ``LogisticRegression`` is fitted on the result.
    The model is then scored on the (non-resampled) test set twice: once with
    the labels from ``model.predict`` and once with the probability cut-off
    that maximises ``TPR - FPR`` on the test ROC curve.

    The caller's DataFrames are not modified.

    Args:
        X_train: Training features (pandas DataFrame).
        X_test: Test features (pandas DataFrame).
        y_train: Training labels.
        y_test: Test labels; must be binary, because the confusion matrix is
            unpacked into exactly four cells.
        random_state: Optional seed forwarded to SMOTE so a run can be
            reproduced. ``None`` (the default) keeps the resampling random.

    Returns:
        An 11-tuple, in this order: ``auc, accuracy, recall, precision, f1,
        specificity, opt_accuracy, opt_recall, opt_precision, opt_f1,
        opt_specificity``. The ``opt_*`` values are measured at the
        ROC-derived cut-off.
    """
    model = LogisticRegression()

    # Cast boolean columns to int before SMOTE. Each frame gets its own column
    # list, and ``astype`` returns a new frame, so the frames passed in by the
    # caller keep their original dtypes.
    train_bool_cols = X_train.columns[X_train.dtypes == np.bool_]
    X_train = X_train.astype({col: int for col in train_bool_cols})
    test_bool_cols = X_test.columns[X_test.dtypes == np.bool_]
    X_test = X_test.astype({col: int for col in test_bool_cols})

    # Oversample the minority class -- training data only.
    smt = SMOTE(sampling_strategy='auto', random_state=random_state)
    X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

    # Fit, then predict hard labels and positive-class probabilities.
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Metrics for the model's own predicted labels.
    # NOTE: f1 is the class-weighted average, while precision and recall use
    # the default (positive class only) -- kept as in the original design.
    auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    # Cut-off that maximises TPR - FPR on the test ROC curve.
    # NOTE(review): the cut-off is picked on the same test set it is then
    # scored on, so the opt_* metrics are optimistic.
    fper, tper, thresholds = roc_curve(y_test, y_proba)
    optimal_idx = np.argmax(tper - fper)
    optimal_threshold = thresholds[optimal_idx]

    # roc_curve computes each (fpr, tpr) pair for ``score >= threshold``, so
    # the comparison must be inclusive to reproduce the selected ROC point.
    y_optpred = (y_proba >= optimal_threshold).astype(int)

    opt_accuracy = accuracy_score(y_test, y_optpred)
    opt_precision = precision_score(y_test, y_optpred)
    opt_recall = recall_score(y_test, y_optpred)
    opt_f1 = f1_score(y_test, y_optpred, average='weighted')

    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_optpred).ravel()
    opt_specificity = tn2 / (tn2 + fp2)

    return (auc, accuracy, recall, precision, f1, specificity,
            opt_accuracy, opt_recall, opt_precision, opt_f1, opt_specificity)

### numeric 추가

In [39]:
# Feature frame (the preprocessed dataset) and target (the '사고유무' column
# of the baseline data).
yn_X, yn_y = preprocess, base['사고유무']

In [40]:
# Print the column names, non-null counts and dtypes of the feature frame.
yn_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187983 entries, 0 to 187982
Data columns (total 36 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   여성            187983 non-null  bool 
 1   외산            187983 non-null  bool 
 2   1인 및 지정1인     187983 non-null  bool 
 3   가족 및 지정1인     187983 non-null  bool 
 4   가족및형제자매한정     187983 non-null  bool 
 5   가족한정(형제자매제외)  187983 non-null  bool 
 6   기명피보험자1인한정    187983 non-null  bool 
 7   누구나(기본)       187983 non-null  bool 
 8   부부 및 지정1인     187983 non-null  bool 
 9   부부한정          187983 non-null  bool 
 10  임직원한정         187983 non-null  bool 
 11  미가입           187983 non-null  bool 
 12  연령대           187983 non-null  int64
 13  C             187983 non-null  bool 
 14  D             187983 non-null  bool 
 15  N             187983 non-null  bool 
 16  Z             187983 non-null  bool 
 17  10년이하         187983 non-null  bool 
 18  5년이하          187983 non-null  bool 
 19  신차

In [41]:
# Numeric stand-ins for five categorical variables.
# One character of ``data_name`` belongs to each variable: with '1' the value
# is read from that variable's boolean indicator columns, with '2' from its
# single coded column. With any other character the numeric column is simply
# not created. Within each rule list the first matching condition wins.

# Age band (a)
if data_name[0] == '1':
    yn_X['연령numeric'] = np.select(
        [yn_X[col] == True
         for col in ('30.0', '40.0', '50.0', '60.0', '20.0', '70.0', '80.0', '90.0')],
        [35, 44, 54, 64, 25, 73, 83, 93],
        default=19,  # the teens (10s) indicator column was dropped
    )
elif data_name[0] == '2':
    yn_X['연령numeric'] = np.select(
        [yn_X['연령대'] == band for band in (30, 40, 50, 60, 20, 70, 80, 90)],
        [35, 44, 54, 64, 25, 73, 83, 93],
        default=19,
    )

# NCR (b)
if data_name[1] == '1':
    yn_X['직전3년numeric'] = np.select(
        [yn_X[col] == True for col in ('N', 'D', 'C', 'Z')],
        [0, 1, 2, 0.15],
        default=3,
    )
elif data_name[1] == '2':
    yn_X['직전3년numeric'] = np.select(
        [yn_X['직전3년간사고건수'] == code for code in ('N', 'D', 'C', 'Z')],
        [0, 1, 2, 0.15],
        default=3,
    )

# Vehicle age (c)
if data_name[2] == '1':
    yn_X['차량경과numeric'] = np.select(
        [yn_X[col] == True for col in ('신차', '5년이하', '10년이하')],
        [0, 3.16, 7.8],
        default=12.86,
    )
elif data_name[2] == '2':
    yn_X['차량경과numeric'] = np.select(
        [yn_X['차량경과년수'] == code for code in (3, 1, 2)],
        [12.86, 3.16, 7.8],
        default=0,
    )

# Vehicle class (d)
if data_name[3] == '1':
    yn_X['차종numeric'] = np.select(
        [yn_X[col] == True for col in ('소형A', '소형B', '중형', '대형', '다목적2종')],
        [998, 1500, 1999, 2740, 2080],
        default=2080,  # the '다목적1종' indicator column was dropped
    )
elif data_name[3] == '2':
    yn_X['차종numeric'] = np.select(
        [yn_X['차종'] == code for code in (0, 1, 2, 3, 4, 5)],
        [998, 1500, 1999, 2740, 2080, 2080],
        default=np.nan,
    )

# Mileage agreement distance (e)
if data_name[4] == '1':
    yn_X['마일리지numeric'] = np.select(
        [yn_X[col] == True
         for col in ('3000K', '5000K', '7000K', '10000K', '12000K', '15000K')],
        [3, 5, 7, 10, 12, 15],
        default=0,
    )
elif data_name[4] == '2':
    yn_X['마일리지numeric'] = np.select(
        [yn_X['마일리지약정거리'] == code for code in (0, 1, 2, 3, 4, 5)],
        [3, 5, 7, 10, 12, 15],
        default=0,
    )


In [42]:
# Display the feature frame (the notebook renders a cell's last expression).
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,7.0,8.0,1억이하,5천만원이하,미가입.1,연령numeric,직전3년numeric,차량경과numeric,차종numeric,마일리지numeric
0,True,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,19,2.0,7.80,1999,15
1,True,False,False,False,False,False,False,True,False,False,...,False,True,False,True,False,19,1.0,3.16,2080,0
2,True,False,False,False,False,True,False,False,False,False,...,False,True,False,False,True,19,1.0,12.86,1999,0
3,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,19,0.0,3.16,1500,15
4,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,19,0.0,3.16,1500,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187978,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,True,44,2.0,3.16,1500,15
187979,False,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,44,2.0,12.86,2080,15
187980,False,False,False,False,False,True,False,False,False,False,...,True,False,False,True,False,44,2.0,3.16,2740,0
187981,False,False,False,False,False,False,False,True,False,False,...,False,True,False,False,True,44,2.0,12.86,1999,7


### numeric 모델 실행

In [8]:
# Ablation over the five numeric columns: every keep/drop combination is
# trained and scored on the same sample, for several random splits.
from itertools import product

N_REPEATS = 30            # number of random train/test splits
TRAIN_SAMPLE_SIZE = 8000  # training rows sampled from each split

# Candidate columns, in the order their flags appear in the combination name.
candidate_columns = ['연령numeric', '직전3년numeric', '차량경과numeric',
                     '차종numeric', '마일리지numeric']

# Order in which logistic_classifier returns its scores.
score_names = ['auc', 'accuracy', 'recall', 'precision', 'f1-score', 'specificity',
               'opt_accuracy', 'opt_recall', 'opt_precision', 'opt_f1-score',
               'opt_specificity']

# Column order of the saved results table.
result_columns = ['numeric_name', 'auc', 'accuracy', 'precision', 'recall',
                  'f1-score', 'specificity', 'opt_accuracy', 'opt_precision',
                  'opt_recall', 'opt_f1-score', 'opt_specificity']

records = []

for i in range(N_REPEATS):

    # Train/test split that preserves the class ratio of y.
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # Random training subsample, with the labels for the same rows.
    X_tr_rd = X_tr.sample(n=TRAIN_SAMPLE_SIZE)
    y_tr_rd = y_tr.loc[X_tr_rd.index]

    # One flag per candidate column: 1 = leave the column out, 2 = include it.
    # product() varies the last flag fastest, i.e. '11111', '11112', ...
    for flags in product((1, 2), repeat=len(candidate_columns)):
        name = "".join(str(flag) for flag in flags)
        drop_list = [col for flag, col in zip(flags, candidate_columns) if flag == 1]

        # drop() always returns new frames (also for an empty list), so the
        # shared sample and test set are never handed to the classifier
        # directly and cannot be altered by it between combinations.
        X_train = X_tr_rd.drop(columns=drop_list)
        X_test = X_tst.drop(columns=drop_list)

        scores = logistic_classifier(X_train, X_test, y_tr_rd, y_test)

        row = dict(zip(score_names, scores))
        row['numeric_name'] = name
        records.append(row)

    # Progress
    print("========== ", (i + 1) / N_REPEATS * 100, "% 완료 ===========")


# Results table: one row per (split, combination), sorted by combination name.
results = pd.DataFrame(records, columns=result_columns)
results = results.sort_values(by=["numeric_name"])

# Save as csv
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR최종/결과/LR_{data_name}_numeric(6_7).csv", index = False)



### numeric 랭킹

In [9]:
# Average every metric over the repeated runs of each combination, then show
# the ten combinations with the highest mean AUC.
numeric_mean = results.groupby("numeric_name", as_index=False).mean()
numeric_mean.sort_values('auc', ascending=False).iloc[:10]

Unnamed: 0,numeric_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
5,11212,0.576567,0.823271,0.187127,0.081482,0.792293,0.94292,0.514567,0.166444,0.621077,0.584672,0.497387
1,11112,0.574009,0.811535,0.184249,0.104021,0.788478,0.925656,0.50932,0.164729,0.621652,0.580499,0.491201
4,11211,0.571032,0.811958,0.184946,0.103855,0.788738,0.926172,0.521628,0.164751,0.59843,0.59109,0.509241
13,12212,0.570536,0.831776,0.175424,0.056843,0.793233,0.956771,0.506806,0.164162,0.622233,0.577843,0.488187
9,12112,0.570234,0.819477,0.175579,0.08094,0.789958,0.938601,0.501366,0.163652,0.629197,0.571944,0.480747
21,21212,0.570154,0.821498,0.1817,0.081425,0.79124,0.94087,0.525772,0.16484,0.592359,0.595606,0.515031
17,21112,0.569633,0.808541,0.182935,0.109198,0.787364,0.921344,0.498455,0.162477,0.627084,0.569032,0.477707
12,12211,0.569596,0.821691,0.1728,0.07471,0.790284,0.942178,0.51242,0.164425,0.613347,0.582548,0.496141
20,21211,0.569275,0.809781,0.181059,0.104826,0.787533,0.923488,0.52977,0.164959,0.585223,0.598813,0.520826
0,11111,0.567556,0.800784,0.184401,0.126893,0.784802,0.909481,0.500497,0.161987,0.620548,0.570538,0.481134


### numeric 변수 결론

1트 : 11212 / 11112 / 12212

2트 : 11212 /

3트 : 11212

4트 :

In [43]:
# Chosen combination, entered by hand (it could instead be read from the
# first row of a ranking table).
# One character per numeric column: '1' drops the column, anything else keeps it.
numeric_name = "11212"
name = ""

# Columns whose flag is '1', in flag order.
drop_list = [
    column
    for flag, column in zip(
        numeric_name,
        ['연령numeric', '직전3년numeric', '차량경과numeric', '차종numeric', '마일리지numeric'],
    )
    if flag == '1'
]

# Remove the rejected columns from the feature frame.
if drop_list:
    yn_X = yn_X.drop(columns=drop_list)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,4.0,5.0,6.0,7.0,8.0,1억이하,5천만원이하,미가입.1,차량경과numeric,마일리지numeric
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,True,False,7.8,15
1,True,False,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,3.16,0
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,True,12.86,0


### 전년도 사고율 추가

In [44]:
# Display the accident-rate lookup table loaded from accident_rate.csv.
accident_rate

Unnamed: 0,10대,20대,30대,40대,50대,60대,70대,80대,90대,여성,남성,소형,중형,대형,미가입,가족,부부,기명피보험자1인,기타
0,1.21,0.55,0.47,0.48,0.64,0.75,0.79,0.79,0.79,0.155036,0.129267,1.09,1.35,1.5,0.12,0.16,0.11,0.12,0.14


In [45]:
# Prior-year accident rates
# Each row of ``yn_X`` gets the rate of the group it belongs to. The rates are
# read as plain scalars from the first row of ``accident_rate`` (the table
# holds a single row), so the assignments do not depend on broadcasting a
# one-row Series against the full-length conditions.
# Within each rule list the first matching condition wins.
rate_row = accident_rate.iloc[0]

# Age band
# NOTE(review): every age of 70 and above uses the '70대' rate in both
# branches although the table also has '80대' and '90대' columns --
# presumably deliberate; confirm.
if data_name[0] == '1':
    yn_X['연령대전년도사고율'] = np.select(
        [
            yn_X['30.0'] == True,
            yn_X['40.0'] == True,
            yn_X['50.0'] == True,
            yn_X['60.0'] == True,
            yn_X['20.0'] == True,
            yn_X['70.0'] == True,
            yn_X['80.0'] == True,
            yn_X['90.0'] == True,
        ],
        [
            rate_row['30대'],
            rate_row['40대'],
            rate_row['50대'],
            rate_row['60대'],
            rate_row['20대'],
            rate_row['70대'],
            rate_row['70대'],
            rate_row['70대'],
        ],
        default=rate_row['10대'],  # the teens (10s) indicator column was dropped
    )
elif data_name[0] == '2':
    yn_X['연령대전년도사고율'] = np.select(
        [
            yn_X['연령대'] == 30,
            yn_X['연령대'] == 40,
            yn_X['연령대'] == 50,
            yn_X['연령대'] == 60,
            yn_X['연령대'] == 20,
            yn_X['연령대'] >= 70,
            yn_X['연령대'] == 10,
        ],
        [
            rate_row['30대'],
            rate_row['40대'],
            rate_row['50대'],
            rate_row['60대'],
            rate_row['20대'],
            rate_row['70대'],
            rate_row['10대'],
        ],
        default=np.nan,
    )

# Gender (always one-hot); the male indicator column was dropped, so
# "not female" is treated as male.
yn_X['성별전년도사고율'] = np.where(yn_X['여성'] == True, rate_row['여성'], rate_row['남성'])

# Vehicle class
if data_name[3] == '1':
    yn_X['차종전년도사고율'] = np.select(
        [
            yn_X['중형'] == True,
            yn_X['대형'] == True,
            yn_X['소형B'] == True,
            yn_X['다목적2종'] == True,
            yn_X['소형A'] == True,
        ],
        [
            rate_row['중형'],
            rate_row['대형'],
            rate_row['소형'],
            rate_row['대형'],
            rate_row['소형'],
        ],
        default=rate_row['대형'],  # the '다목적1종' indicator column was dropped
    )
elif data_name[3] == '2':
    yn_X['차종전년도사고율'] = np.select(
        [
            yn_X['차종'] == 2,
            yn_X['차종'] == 3,
            yn_X['차종'] == 1,
            yn_X['차종'] == 5,
            yn_X['차종'] == 0,
            yn_X['차종'] == 4,
        ],
        [
            rate_row['중형'],
            rate_row['대형'],
            rate_row['소형'],
            rate_row['대형'],
            rate_row['소형'],
            rate_row['대형'],
        ],
        default=np.nan,
    )

# Driver-restriction clause (always one-hot)
yn_X['특약전년도사고율'] = np.select(
    [
        (yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True),
        (yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True),
        (yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True),
        yn_X['누구나(기본)'] == True,
        yn_X['임직원한정'] == True,
    ],
    [
        rate_row['기명피보험자1인'],
        rate_row['부부'],
        rate_row['가족'],
        rate_row['미가입'],
        rate_row['기타'],
    ],
    default=rate_row['부부'],  # the '부부 및 자녀한정' indicator column was dropped
)


In [46]:
# Preview the first three rows of the feature frame.
yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,8.0,1억이하,5천만원이하,미가입.1,차량경과numeric,마일리지numeric,연령대전년도사고율,성별전년도사고율,차종전년도사고율,특약전년도사고율
0,True,False,False,False,False,True,False,False,False,False,...,True,False,True,False,7.8,15,1.21,0.155036,1.35,0.16
1,True,False,False,False,False,False,False,True,False,False,...,True,False,True,False,3.16,0,1.21,0.155036,1.5,0.12
2,True,False,False,False,False,True,False,False,False,False,...,True,False,False,True,12.86,0,1.21,0.155036,1.35,0.16


### 전년도 사고율 모델 실행

In [47]:
# Ablation over the four prior-year accident-rate columns: every keep/drop
# combination is trained and scored on the same sample, for several random
# splits.
from itertools import product

N_REPEATS = 30            # number of random train/test splits
TRAIN_SAMPLE_SIZE = 8000  # training rows sampled from each split

# Candidate columns, in the order their flags appear in the combination name.
candidate_columns = ['연령대전년도사고율', '성별전년도사고율',
                     '차종전년도사고율', '특약전년도사고율']

# Order in which logistic_classifier returns its scores.
score_names = ['auc', 'accuracy', 'recall', 'precision', 'f1-score', 'specificity',
               'opt_accuracy', 'opt_recall', 'opt_precision', 'opt_f1-score',
               'opt_specificity']

# Column order of the saved results table.
result_columns = ['plus_name', 'auc', 'accuracy', 'precision', 'recall',
                  'f1-score', 'specificity', 'opt_accuracy', 'opt_precision',
                  'opt_recall', 'opt_f1-score', 'opt_specificity']

records = []

for i in range(N_REPEATS):

    # Train/test split that preserves the class ratio of y.
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # Random training subsample, with the labels for the same rows.
    X_tr_rd = X_tr.sample(n=TRAIN_SAMPLE_SIZE)
    y_tr_rd = y_tr.loc[X_tr_rd.index]

    # One flag per candidate column: 1 = leave the column out, 2 = include it.
    # product() varies the last flag fastest, i.e. '1111', '1112', ...
    for flags in product((1, 2), repeat=len(candidate_columns)):
        name = "".join(str(flag) for flag in flags)
        drop_list = [col for flag, col in zip(flags, candidate_columns) if flag == 1]

        # drop() always returns new frames (also for an empty list), so the
        # shared sample and test set are never handed to the classifier
        # directly and cannot be altered by it between combinations.
        X_train = X_tr_rd.drop(columns=drop_list)
        X_test = X_tst.drop(columns=drop_list)

        scores = logistic_classifier(X_train, X_test, y_tr_rd, y_test)

        row = dict(zip(score_names, scores))
        row['plus_name'] = name
        records.append(row)

    # Progress
    print("========== ", (i + 1) / N_REPEATS * 100, "% 완료 ===========")


# Results table: one row per (split, combination), sorted by combination name.
results = pd.DataFrame(records, columns=result_columns)
results = results.sort_values(by=["plus_name"])

# Save as csv. These are logistic-regression results, so they go to the LR
# results folder like the other result files of this notebook; the previous
# XGB path would have written over the XGBoost experiment's file.
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR최종/결과/LR_{data_name}_plus(6_7).csv", index = False)



### 전년도 사고율 랭킹

In [15]:
# Average every metric over the repeated runs of each combination, then show
# the ten combinations with the highest mean AUC.
plus_mean = results.groupby("plus_name", as_index=False).mean()
plus_mean.sort_values('auc', ascending=False).iloc[:10]

Unnamed: 0,plus_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
5,1212,0.57675,0.825089,0.187532,0.077901,0.792838,0.945608,0.50215,0.165445,0.637285,0.572429,0.480353
1,1112,0.576676,0.824459,0.188064,0.079854,0.792757,0.944562,0.50883,0.165891,0.627933,0.579153,0.48962
0,1111,0.576234,0.823828,0.187034,0.08025,0.792462,0.943765,0.501283,0.165292,0.637419,0.571187,0.479325
4,1211,0.576149,0.825241,0.188224,0.078086,0.792977,0.945755,0.50641,0.165341,0.629797,0.577021,0.486508
9,2112,0.576106,0.823685,0.187462,0.080965,0.792492,0.943483,0.512821,0.165961,0.621301,0.58297,0.495324
13,2212,0.575917,0.824439,0.187946,0.079625,0.792724,0.944576,0.507543,0.165517,0.628342,0.578088,0.488059
12,2211,0.575351,0.823869,0.186457,0.079995,0.79245,0.943854,0.513698,0.165855,0.618914,0.58334,0.496727
8,2111,0.57451,0.823291,0.186771,0.081431,0.792293,0.942951,0.512328,0.165405,0.619265,0.58279,0.49508
2,1121,0.573495,0.823332,0.189306,0.083065,0.792594,0.942735,0.506652,0.164446,0.624173,0.577535,0.487696
6,1221,0.573382,0.823876,0.18993,0.082146,0.792804,0.943515,0.503696,0.164085,0.627659,0.574745,0.483701


### 전년도 사고율 변수 결론

1트 : 1212

2트 : 1212

3트 :1212



In [48]:
# Chosen combination, entered by hand (it could instead be read from the
# first row of a ranking table).
# The top candidates were tied, so the one with the higher accuracy is used;
# always including tied columns was considered as an alternative.
# One character per rate column: '1' drops the column, anything else keeps it.
plus_name = "1212"
name = ""

# Columns whose flag is '1', in flag order.
drop_list = [
    column
    for flag, column in zip(
        plus_name,
        ['연령대전년도사고율', '성별전년도사고율', '차종전년도사고율', '특약전년도사고율'],
    )
    if flag == '1'
]

# Remove the rejected columns from the feature frame.
if drop_list:
    yn_X = yn_X.drop(columns=drop_list)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,6.0,7.0,8.0,1억이하,5천만원이하,미가입.1,차량경과numeric,마일리지numeric,성별전년도사고율,특약전년도사고율
0,True,False,False,False,False,True,False,False,False,False,...,False,False,True,False,True,False,7.8,15,0.155036,0.16
1,True,False,False,False,False,False,False,True,False,False,...,False,False,True,False,True,False,3.16,0,0.155036,0.12
2,True,False,False,False,False,True,False,False,False,False,...,False,False,True,False,False,True,12.86,0,0.155036,0.16


### 예측 사고율 추가

In [49]:
# Display the predicted accident-rate lookup table loaded from predicted_result.csv.
predicted

Unnamed: 0,20대,30대,40대,50대,~64,65~,남성,여성,특약부부,특약 미가입,특약 기타,특약 기명피보험자1인,특약 가족,남성 TAAS,여성 TAAS
0,0.00551,0.003748,0.033873,0.004499,0.022487,0.010354,0.04021,0.025433,0.043751,0.055043,0.06511,0.045383,0.087615,0.008801,0.003425


In [50]:
# Predicted accident rates
# Each row of ``yn_X`` gets the predicted rate of the group it belongs to.
# The rates are read as plain scalars from the first row of ``predicted``
# (the table holds a single row), so the assignments do not depend on
# broadcasting a one-row Series against the full-length conditions.
# Within each rule list the first matching condition wins.
predicted_row = predicted.iloc[0]

# Age band
if data_name[0] == '1':
    yn_X['연령대예측사고율'] = np.select(
        [
            yn_X['30.0'] == True,
            yn_X['40.0'] == True,
            yn_X['50.0'] == True,
            yn_X['60.0'] == True,
            yn_X['20.0'] == True,
            yn_X['70.0'] == True,
            yn_X['80.0'] == True,
            yn_X['90.0'] == True,
        ],
        [
            predicted_row['30대'],
            predicted_row['40대'],
            predicted_row['50대'],
            predicted_row['~64'],
            predicted_row['20대'],
            predicted_row['65~'],
            predicted_row['65~'],
            predicted_row['65~'],
        ],
        # The teens (10s) indicator column was dropped; those rows fall back
        # to the 20s rate.
        default=predicted_row['20대'],
    )
elif data_name[0] == '2':
    yn_X['연령대예측사고율'] = np.select(
        [
            yn_X['연령대'] == 30,
            yn_X['연령대'] == 40,
            yn_X['연령대'] == 50,
            yn_X['연령대'] == 60,
            yn_X['연령대'] <= 20,
            yn_X['연령대'] >= 70,
        ],
        [
            predicted_row['30대'],
            predicted_row['40대'],
            predicted_row['50대'],
            predicted_row['~64'],
            predicted_row['20대'],
            predicted_row['65~'],
        ],
        default=np.nan,
    )

# Gender, TAAS-based rate
yn_X['TAAS성별예측사고율'] = np.where(yn_X['여성'] == False, predicted_row['남성 TAAS'], predicted_row['여성 TAAS'])

# Gender
yn_X['성별예측사고율'] = np.where(yn_X['여성'] == False, predicted_row['남성'], predicted_row['여성'])

# Driver-restriction clause
yn_X['특약예측사고율'] = np.select(
    [
        (yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True),
        (yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True),
        (yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True),
        yn_X['누구나(기본)'] == True,
        yn_X['임직원한정'] == True,
    ],
    [
        predicted_row['특약 기명피보험자1인'],
        predicted_row['특약부부'],
        predicted_row['특약 가족'],
        predicted_row['특약 미가입'],
        predicted_row['특약 기타'],
    ],
    default=predicted_row['특약부부'],  # the '부부 및 자녀한정' indicator column was dropped
)

In [51]:
# Display the feature frame (the notebook renders a cell's last expression).
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5천만원이하,미가입.1,차량경과numeric,마일리지numeric,성별전년도사고율,특약전년도사고율,연령대예측사고율,TAAS성별예측사고율,성별예측사고율,특약예측사고율
0,True,False,False,False,False,True,False,False,False,False,...,True,False,7.80,15,0.155036,0.16,0.005510,0.003425,0.025433,0.087615
1,True,False,False,False,False,False,False,True,False,False,...,True,False,3.16,0,0.155036,0.12,0.005510,0.003425,0.025433,0.055043
2,True,False,False,False,False,True,False,False,False,False,...,False,True,12.86,0,0.155036,0.16,0.005510,0.003425,0.025433,0.087615
3,True,False,False,False,False,True,False,False,False,False,...,True,False,3.16,15,0.155036,0.16,0.005510,0.003425,0.025433,0.087615
4,True,False,False,False,False,True,False,False,False,False,...,True,False,3.16,15,0.155036,0.16,0.005510,0.003425,0.025433,0.087615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187978,False,False,False,False,False,False,False,False,False,True,...,False,True,3.16,15,0.129267,0.11,0.033873,0.008801,0.040210,0.043751
187979,False,False,False,False,False,False,False,True,False,False,...,True,False,12.86,15,0.129267,0.12,0.033873,0.008801,0.040210,0.055043
187980,False,False,False,False,False,True,False,False,False,False,...,True,False,3.16,0,0.129267,0.16,0.033873,0.008801,0.040210,0.087615
187981,False,False,False,False,False,False,False,True,False,False,...,False,True,12.86,7,0.129267,0.12,0.033873,0.008801,0.040210,0.055043


### 예측 사고율 모델 실행

In [54]:
# Ablation over the four predicted accident-rate columns: every keep/drop
# combination is trained and scored on the same sample, for several random
# splits.
from itertools import product

N_REPEATS = 30            # number of random train/test splits
TRAIN_SAMPLE_SIZE = 8000  # training rows sampled from each split

# Candidate columns, in the order their flags appear in the combination name.
candidate_columns = ['연령대예측사고율', '성별예측사고율',
                     'TAAS성별예측사고율', '특약예측사고율']

# Order in which logistic_classifier returns its scores.
score_names = ['auc', 'accuracy', 'recall', 'precision', 'f1-score', 'specificity',
               'opt_accuracy', 'opt_recall', 'opt_precision', 'opt_f1-score',
               'opt_specificity']

# Column order of the saved results table.
result_columns = ['lstm_name', 'auc', 'accuracy', 'precision', 'recall',
                  'f1-score', 'specificity', 'opt_accuracy', 'opt_precision',
                  'opt_recall', 'opt_f1-score', 'opt_specificity']

records = []

for i in range(N_REPEATS):

    # Train/test split that preserves the class ratio of y.
    X_tr, X_tst, y_tr, y_test = train_test_split(yn_X, yn_y, test_size=0.2, stratify=yn_y)

    # Random training subsample, with the labels for the same rows.
    X_tr_rd = X_tr.sample(n=TRAIN_SAMPLE_SIZE)
    y_tr_rd = y_tr.loc[X_tr_rd.index]

    # One flag per candidate column: 1 = leave the column out, 2 = include it.
    # product() varies the last flag fastest, i.e. '1111', '1112', ...
    for flags in product((1, 2), repeat=len(candidate_columns)):
        name = "".join(str(flag) for flag in flags)
        drop_list = [col for flag, col in zip(flags, candidate_columns) if flag == 1]

        # drop() always returns new frames (also for an empty list), so the
        # shared sample and test set are never handed to the classifier
        # directly and cannot be altered by it between combinations.
        X_train = X_tr_rd.drop(columns=drop_list)
        X_test = X_tst.drop(columns=drop_list)

        scores = logistic_classifier(X_train, X_test, y_tr_rd, y_test)

        row = dict(zip(score_names, scores))
        row['lstm_name'] = name
        records.append(row)

    # Progress
    print("========== ", (i + 1) / N_REPEATS * 100, "% 완료 ===========")

# Results table: one row per (split, combination), sorted by combination name.
results = pd.DataFrame(records, columns=result_columns)
results = results.sort_values(by=["lstm_name"])

# Save as csv
results.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR최종/결과/LR_{data_name}_lstm(6_7).csv", index = False)



### 예측 사고율 랭킹

In [55]:
# Average every metric over the repeated runs of each combination, then show
# the ten combinations with the highest mean AUC.
lstm_mean = results.groupby("lstm_name", as_index=False).mean()
lstm_mean.sort_values('auc', ascending=False).iloc[:10]

Unnamed: 0,lstm_name,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
13,2212,0.576555,0.825058,0.187706,0.078099,0.792874,0.94554,0.50662,0.165396,0.629299,0.576891,0.486832
0,1111,0.576364,0.824084,0.187752,0.080237,0.792615,0.944065,0.51551,0.166151,0.61772,0.585332,0.499024
15,2222,0.576328,0.823773,0.187087,0.080563,0.792471,0.943651,0.525528,0.166776,0.603013,0.595019,0.51303
10,2121,0.576275,0.824879,0.187958,0.078552,0.792837,0.945259,0.502596,0.165278,0.636097,0.573339,0.481063
12,2211,0.576219,0.823129,0.188279,0.082784,0.79243,0.942544,0.512566,0.165954,0.621595,0.582639,0.49498
1,1112,0.576201,0.824717,0.187279,0.078431,0.792719,0.94509,0.509675,0.16562,0.62559,0.580415,0.490978
2,1121,0.576073,0.824284,0.187776,0.07965,0.792647,0.944391,0.510228,0.165619,0.624352,0.580795,0.49182
8,2111,0.575924,0.823217,0.187834,0.082274,0.792413,0.942729,0.51006,0.165788,0.625188,0.58023,0.49149
6,1221,0.575775,0.823346,0.186812,0.081246,0.792326,0.943045,0.504704,0.165123,0.631891,0.575646,0.484189
9,2112,0.575716,0.824085,0.187604,0.080218,0.792614,0.944069,0.517798,0.166308,0.613232,0.586812,0.502405


### 예측 사고율 변수 결론

1트 : 1122

2트 : 1122

3트 : 1222

4트 : 1221

5트 : 1211

1211

2222

2121

1221

1111

2112

2222


In [56]:
# Chosen combination, entered by hand (it could instead be read from the
# first row of a ranking table).
# The repeated runs showed no consistent winner, so every column is kept.
# One character per rate column: '1' drops the column, anything else keeps it.
lstm_name = "2222"
name = ""

# Columns whose flag is '1', in flag order.
drop_list = [
    column
    for flag, column in zip(
        lstm_name,
        ['연령대예측사고율', '성별예측사고율', 'TAAS성별예측사고율', '특약예측사고율'],
    )
    if flag == '1'
]

# Remove the rejected columns from the feature frame.
if drop_list:
    yn_X = yn_X.drop(columns=drop_list)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5천만원이하,미가입.1,차량경과numeric,마일리지numeric,성별전년도사고율,특약전년도사고율,연령대예측사고율,TAAS성별예측사고율,성별예측사고율,특약예측사고율
0,True,False,False,False,False,True,False,False,False,False,...,True,False,7.8,15,0.155036,0.16,0.00551,0.003425,0.025433,0.087615
1,True,False,False,False,False,False,False,True,False,False,...,True,False,3.16,0,0.155036,0.12,0.00551,0.003425,0.025433,0.055043
2,True,False,False,False,False,True,False,False,False,False,...,False,True,12.86,0,0.155036,0.16,0.00551,0.003425,0.025433,0.087615


In [58]:
# Save the final dataset with the selected derived variables

# Attach the '사고유무' column from the baseline data so the saved file holds
# it next to the feature columns, then write the frame without the index.
yn_X['사고유무'] = base['사고유무']
yn_X.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR최종/결과/LR_{data_name}_파생변수.csv", index = False)