### 라이브러리 호출

In [29]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from sklearn.linear_model import LogisticRegression, LinearRegression

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

### 파일 불러오기

In [30]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

### dataframe 이름 넣기 ###
### 여기에서 데이터셋 수정 ###
data_name = "1122212"

# base preprocess 데이터
base = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", encoding = "euc-kr", engine='python')

# 전처리 데이터셋
preprocess = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/DF/{data_name}.csv", encoding = 'euc-kr', engine='python')

# 파생변수 데이터셋
accident_rate = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/accident_rate.csv")
predicted = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/predicted_result.csv", encoding = 'UTF8', engine='python')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### 모델 함수

In [31]:
# 로지스틱 회귀 함수
def logistic_classifier(X, y):

    acc_opt = []
    prec_opt = []
    recall_opt = []
    opt_thres = []

    # LogisticRegression 초기화
    model = LogisticRegression()

    for i in range(50):

        # smote 전 data type 변환
        bool_mask = X.dtypes == np.bool_
        bool_cols = X.columns[bool_mask].tolist()

        for col_name in bool_cols:
            X[col_name] = X[col_name].astype(int)

        # train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

        # Random Sampling
        X_train_rd = X_train.sample(n = 8000)
        y_train_rd = y_train[X_train_rd.index]

        # SMOTE oversampling
        smt = SMOTE(sampling_strategy = 'auto')
        X_train_sm, y_train_sm = smt.fit_resample(X_train_rd, y_train_rd)

        # 모델 학습 / 예측
        model.fit(X_train_sm, y_train_sm)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1] # 양성 클래스에 대한 확률 추출

        # Optimal thr 저장
        fper, tper, thresholds = roc_curve(y_test, y_proba)
        optimal_idx = np.argmax(tper - fper)   # fpr, tpr 간 차이가 가장 클 때의 index 저장

        # Optimal thr일 때 평가지표 저장
        y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

        acc_opt.append(accuracy_score(y_test, y_optpred))
        prec_opt.append(precision_score(y_test, y_optpred))
        recall_opt.append(recall_score(y_test, y_optpred))
        opt_thres.append(thresholds[optimal_idx])

        # # 횟수 출력
        # if((i+1)%10 == 0) :
        #     print(f"{i+1}번째 완료")

    # 결과 DataFrame 생성
    results = pd.DataFrame({
        'accuracy' : acc_opt,
        'precision' : prec_opt,
        'recall' : recall_opt,
        'opt threshold' : opt_thres
    })

    return results

### numeric 추가

In [32]:
# X, y 설정
yn_y = base['사고유무']
yn_X = preprocess

In [33]:
# 연령대 (a)
if (data_name[0] == '1'):
    yn_X['연령numeric'] = np.where(yn_X['30.0'] == True, 35,
                        np.where(yn_X['40.0'] == True, 44,
                        np.where(yn_X['50.0'] == True, 54,
                        np.where(yn_X['60.0'] == True, 64,
                        np.where(yn_X['20.0'] == True, 25,
                        np.where(yn_X['70.0'] == True, 73,
                        np.where(yn_X['80.0'] == True, 83,
                        np.where(yn_X['90.0'] == True, 93, 19)))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령numeric'] = np.where(yn_X['연령대'] == 30, 35,
                            np.where(yn_X['연령대'] == 40, 44,
                            np.where(yn_X['연령대'] == 50, 54,
                            np.where(yn_X['연령대'] == 60, 64,
                            np.where(yn_X['연령대'] == 20, 25,
                            np.where(yn_X['연령대'] == 70, 73,
                            np.where(yn_X['연령대'] == 80, 83,
                            np.where(yn_X['연령대'] == 90, 93, 19))))))))

# NCR (b)
if (data_name[1] == '1'):
    yn_X['직전3년numeric'] = np.where(yn_X['N'] == True, 0,
                        np.where(yn_X['D'] == True, 1,
                        np.where(yn_X['C'] == True, 2,
                        np.where(yn_X['Z'] == True, 0.15, 3))))
elif(data_name[1] == '2'):
    yn_X['직전3년numeric'] = np.where(yn_X['직전3년간사고건수'] == 'N', 0,
                            np.where(yn_X['직전3년간사고건수'] == 'D', 1,
                            np.where(yn_X['직전3년간사고건수'] == 'C', 2,
                            np.where(yn_X['직전3년간사고건수'] == 'Z', 0.15, 3))))

# 차량경과년수 (c)
if (data_name[2] == '1'):
    yn_X['차량경과numeric'] = np.where(yn_X['10년이상'] == True, 12.86,
                        np.where(yn_X['5년이하'] == True, 3.16,
                        np.where(yn_X['10년이하'] == True, 7.8, 0)))
elif(data_name[2] == '2'):
    yn_X['차량경과numeric'] = np.where(yn_X['차량경과년수'] == 3, 12.86,
                            np.where(yn_X['차량경과년수'] == 1, 3.16,
                            np.where(yn_X['차량경과년수'] == 2, 7.8, 0)))

# 차종 (d)
if (data_name[3] == '1'):
    yn_X['차종numeric'] = np.where(yn_X['소형A'] == True, 998,
                        np.where(yn_X['소형B'] == True, 1500,
                        np.where(yn_X['중형'] == True, 1999,
                        np.where(yn_X['대형'] == True, 2740,
                        np.where(yn_X['다목적1종'] == True, 2080,
                        np.where(yn_X['다목적2종'] == True, 2080, np.nan))))))
elif(data_name[3] == '2'):
    yn_X['차종numeric'] = np.where(yn_X['차종'] == 0, 998,
                            np.where(yn_X['차종'] == 1, 1500,
                            np.where(yn_X['차종'] == 2, 1999,
                            np.where(yn_X['차종'] == 3, 2740,
                            np.where(yn_X['차종'] == 4, 2080,
                            np.where(yn_X['차종'] == 5, 2080, np.nan))))))

# 마일리지약정거리 (e)
if (data_name[4] == '1'):
    yn_X['마일리지numeric'] = np.where(yn_X['3000K'] == True, 3,
                        np.where(yn_X['5000K'] == True, 5,
                        np.where(yn_X['7000K'] == True, 7,
                        np.where(yn_X['10000K'] == True, 10,
                        np.where(yn_X['12000K'] == True, 12,
                        np.where(yn_X['15000K'] == True, 15, 0))))))
elif(data_name[4] == '2'):
    yn_X['마일리지numeric'] = np.where(yn_X['마일리지약정거리'] == 0, 3,
                            np.where(yn_X['마일리지약정거리'] == 1, 5,
                            np.where(yn_X['마일리지약정거리'] == 2, 7,
                            np.where(yn_X['마일리지약정거리'] == 3, 10,
                            np.where(yn_X['마일리지약정거리'] == 4, 12,
                            np.where(yn_X['마일리지약정거리'] == 5, 15, 0))))))


In [34]:
yn_X.head(5)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5.0,6.0,7.0,8.0,차량가입금액,연령numeric,직전3년numeric,차량경과numeric,차종numeric,마일리지numeric
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,1,19,2.0,7.8,1999.0,15
1,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,1,19,1.0,3.16,2080.0,0
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,0,19,1.0,12.86,1999.0,0
3,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,1,19,0.0,3.16,1500.0,15
4,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,1,19,0.0,3.16,1500.0,15


### numeric 모델 실행

In [35]:
# 처음 시작할 때
total = pd.DataFrame()

# 이미 실행한 게 있을 경우
# total = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/LR{data_name}_numeric결과.csv", encoding="euc-kr")

count = 0
for a in range(1,3):
    for b in range(1,3):
        for c in range(1,3):
            for d in range(1,3):
                for e in range(1,3):

                    # numeric 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
                    drop_list = []
                    name = str(a) + str(b) + str(c) + str(d) + str(e)

                    # drop할 column과 dataset 이름 설정
                    if (a==1):
                        drop_list.append('연령numeric')
                    if (b==1):
                        drop_list.append('직전3년numeric')
                    if (c==1):
                        drop_list.append('차량경과numeric')
                    if (d==1):
                        drop_list.append('차종numeric')
                    if (e==1):
                        drop_list.append('마일리지numeric')

                    # 데이터셋 X, y 설정
                    if(len(drop_list) != 0):
                        X = yn_X.drop(columns = drop_list, axis = 1)
                    else:
                        X = yn_X

                    y = yn_y

                    # 모델 실행
                    logistic_rst = logistic_classifier(X, y)

                    # 결과 저장 & csv 저장
                    logistic_rst['numeric_name'] = name
                    total = pd.concat([total, logistic_rst])

                    total.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR{data_name}_numeric결과.csv", index = False, encoding="euc-kr")

                    # 진행상황 출력
                    count += 1
                    print(f"{(count/32) * 100}% 완료")


3.125% 완료
6.25% 완료
9.375% 완료
12.5% 완료
15.625% 완료
18.75% 완료
21.875% 완료
25.0% 완료
28.125% 완료
31.25% 완료
34.375% 완료
37.5% 완료
40.625% 완료
43.75% 완료
46.875% 완료
50.0% 완료
53.125% 완료
56.25% 완료
59.375% 완료
62.5% 완료
65.625% 완료
68.75% 완료
71.875% 완료
75.0% 완료
78.125% 완료
81.25% 완료
84.375% 완료
87.5% 완료
90.625% 완료
93.75% 완료
96.875% 완료
100.0% 완료


### numeric 랭킹

In [36]:
numeric_mean = total.drop(columns = ['opt threshold'], axis = 1).groupby("numeric_name").mean().reset_index()

# charting
chart_list = []

# accuracy top count
chart_list.extend(numeric_mean.sort_values(by='accuracy', ascending=False).head(1)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='accuracy', ascending=False).head(3)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='accuracy', ascending=False).head(5)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='accuracy', ascending=False).head(20)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='accuracy', ascending=False).head(30)['numeric_name'].tolist())

# precision top count
chart_list.extend(numeric_mean.sort_values(by='precision', ascending=False).head(1)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='precision', ascending=False).head(3)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='precision', ascending=False).head(5)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='precision', ascending=False).head(20)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='precision', ascending=False).head(30)['numeric_name'].tolist())

# recall top count
chart_list.extend(numeric_mean.sort_values(by='recall', ascending=False).head(1)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='recall', ascending=False).head(3)['numeric_name'].tolist())
chart_list.extend(numeric_mean.sort_values(by='recall', ascending=False).head(5)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='recall', ascending=False).head(20)['numeric_name'].tolist())
# chart_list.extend(numeric_mean.sort_values(by='recall', ascending=False).head(30)['numeric_name'].tolist())

# count 및 rate 계산
count= pd.DataFrame(chart_list)
ranking = pd.DataFrame(count.value_counts()).reset_index()
ranking.columns = ['data', 'count']
ranking['rate'] = round(ranking['count']/9, 3)

ranking.head(5).sort_values(by=["count","data"], ascending = False)
# numeric

Unnamed: 0,data,count,rate
2,22122,3,0.333
1,21111,3,0.333
0,11212,3,0.333
4,11112,2,0.222
3,11111,2,0.222


In [37]:
numeric_mean.sort_values(by=["accuracy"], ascending = False).head(5)

Unnamed: 0,numeric_name,accuracy,precision,recall
16,21111,0.558711,0.19259,0.507214
20,21211,0.546686,0.193228,0.533994
0,11111,0.543464,0.191406,0.531275
4,11211,0.539762,0.193914,0.549063
1,11112,0.538376,0.194085,0.553399


In [38]:
numeric_mean.sort_values(by=["precision"], ascending = False).head(5)

Unnamed: 0,numeric_name,accuracy,precision,recall
5,11212,0.529107,0.196016,0.581029
13,12212,0.518096,0.19512,0.597984
2,11121,0.516879,0.194228,0.595525
1,11112,0.538376,0.194085,0.553399
4,11211,0.539762,0.193914,0.549063


In [39]:
numeric_mean.sort_values(by=["recall"], ascending = False).head(5)

Unnamed: 0,numeric_name,accuracy,precision,recall
27,22122,0.405453,0.178155,0.700442
31,22222,0.409683,0.177058,0.687248
15,12222,0.435735,0.181964,0.672392
14,12221,0.460539,0.18793,0.666375
10,12121,0.470301,0.188678,0.652656


### numeric 변수 결론

아니 실행할 때마다 달라지면 어카자는거지..

In [40]:
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5.0,6.0,7.0,8.0,차량가입금액,연령numeric,직전3년numeric,차량경과numeric,차종numeric,마일리지numeric
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,19,2.0,7.80,1999.0,15
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,19,1.0,3.16,2080.0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,19,1.0,12.86,1999.0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,19,0.0,3.16,1500.0,15
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,19,0.0,3.16,1500.0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210310,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,44,2.0,3.16,1500.0,0
210311,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,44,2.0,12.86,1999.0,7
210312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,44,2.0,3.16,998.0,15
210313,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,44,2.0,3.16,1500.0,0


In [41]:
# ranking 결과 데이터셋 이름 가져오기
numeric_name = ranking.iloc[0, 0]

### 직접 입력
# numeric_name = str(21111)

# 데이터셋 생성
drop_list = []
name = ""

# drop할 column과 dataset 이름 설정
if (numeric_name[0] == '1'):
    drop_list.append('연령numeric')
if (numeric_name[1] == '1'):
    drop_list.append('직전3년numeric')
if (numeric_name[2] == '1'):
    drop_list.append('차량경과numeric')
if (numeric_name[3] == '1'):
    drop_list.append('차종numeric')
if (numeric_name[4] == '1'):
    drop_list.append('마일리지numeric')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,2.0,3.0,4.0,5.0,6.0,7.0,8.0,차량가입금액,차량경과numeric,마일리지numeric
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,7.8,15
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,3.16,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,12.86,0


### 전년도 사고율 추가

In [42]:
accident_rate

Unnamed: 0,10대,20대,30대,40대,50대,60대,70대,80대,90대,여성,남성,소형,중형,대형,미가입,가족,부부,기명피보험자1인,기타
0,1.21,0.55,0.47,0.48,0.64,0.75,0.79,0.79,0.79,0.155036,0.129267,1.09,1.35,1.5,0.12,0.16,0.11,0.12,0.14


In [43]:
# 전년도 사고율

# 연령대
if (data_name[0] == '1'):
    yn_X['연령대전년도사고율'] = np.where(yn_X['30.0'] == True, accident_rate['30대'],
                        np.where(yn_X['40.0'] == True, accident_rate['40대'],
                        np.where(yn_X['50.0'] == True, accident_rate['50대'],
                        np.where(yn_X['60.0'] == True, accident_rate['60대'],
                        np.where(yn_X['20.0'] == True, accident_rate['20대'],
                        np.where(yn_X['70.0'] == True, accident_rate['70대'],
                        np.where(yn_X['80.0'] == True, accident_rate['70대'],
                        np.where(yn_X['90.0'] == True, accident_rate['70대'], accident_rate['10대'])))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령대전년도사고율'] = np.where(yn_X['연령대'] == 30, accident_rate['30대'],
                                np.where(yn_X['연령대'] == 40, accident_rate['40대'],
                                np.where(yn_X['연령대'] == 50, accident_rate['50대'],
                                np.where(yn_X['연령대'] == 60, accident_rate['60대'],
                                np.where(yn_X['연령대'] == 20, accident_rate['20대'],
                                np.where(yn_X['연령대'] >= 70, accident_rate['70대'],
                                np.where(yn_X['연령대'] == 10, accident_rate['10대'],np.nan)))))))

# 성별 (원핫 고정)
yn_X['성별전년도사고율'] = np.where(yn_X['여성'] == True, accident_rate['여성'], accident_rate['남성']) # 남성이 지워짐

# 차종
if (data_name[3] == '1'):
    yn_X['차종전년도사고율'] = np.where(yn_X['중형'] == True, accident_rate['중형'],
                        np.where(yn_X['대형'] == True, accident_rate['대형'],
                        np.where(yn_X['소형B'] == True, accident_rate['소형'],
                        np.where(yn_X['다목적2종'] == True, accident_rate['대형'],
                        np.where(yn_X['소형A'] == True, accident_rate['소형'],
                        accident_rate['대형']))))) # 다목적1종이 지워짐
elif(data_name[3] == '2'):
    yn_X['차종전년도사고율'] = np.where(yn_X['차종'] == 2, accident_rate['중형'],
                            np.where(yn_X['차종'] == 3, accident_rate['대형'],
                            np.where(yn_X['차종'] == 1, accident_rate['소형'],
                            np.where(yn_X['차종'] == 5, accident_rate['대형'],
                            np.where(yn_X['차종'] == 0, accident_rate['소형'],
                            np.where(yn_X['차종'] == 4, accident_rate['대형'], np.nan))))))

# 특약 (원핫 고정)
yn_X['특약전년도사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), accident_rate['기명피보험자1인'],
                        np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), accident_rate['부부'],
                        np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), accident_rate['가족'],
                        np.where(yn_X['누구나(기본)'] == True, accident_rate['미가입'],
                        np.where(yn_X['임직원한정'] == True, accident_rate['기타'], accident_rate['부부']))))) # 부부 및 자녀한정이 지워짐


In [44]:
yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,6.0,7.0,8.0,차량가입금액,차량경과numeric,마일리지numeric,연령대전년도사고율,성별전년도사고율,차종전년도사고율,특약전년도사고율
0,1,0,0,0,0,1,0,0,0,0,...,0,0,1,1,7.8,15,1.21,0.155036,1.35,0.16
1,1,0,0,0,0,0,0,1,0,0,...,0,0,1,1,3.16,0,1.21,0.155036,1.5,0.12
2,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,12.86,0,1.21,0.155036,1.35,0.16


### 전년도 사고율 모델 실행

In [45]:
# 처음 시작할 때
total = pd.DataFrame()

# 이미 실행한 게 있을 경우
# total = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/LR{data_name}_plus결과.csv", encoding="euc-kr")

count = 0
for a in range(1,3):
    for b in range(1,3):
        for c in range(1,3):
            for d in range(1,3):

                # plus data 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
                drop_list = []
                name = str(a) + str(b) + str(c) + str(d)

                # drop할 column과 dataset 이름 설정
                if (a==1):
                    drop_list.append('연령대전년도사고율')
                if (b==1):
                    drop_list.append('성별전년도사고율')
                if (c==1):
                    drop_list.append('차종전년도사고율')
                if (d==1):
                    drop_list.append('특약전년도사고율')

                # 데이터셋 X, y 설정
                if(len(drop_list) != 0):
                    X = yn_X.drop(columns = drop_list, axis = 1)
                else:
                    X = yn_X

                y = yn_y

                # 모델 실행
                logistic_rst = logistic_classifier(X, y)

                # 결과 저장 & csv 저장
                logistic_rst['plus_name'] = name
                total = pd.concat([total, logistic_rst])

                total.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR{data_name}_plus결과.csv", index = False, encoding="euc-kr")

                # 진행상황 출력
                count += 1
                print(f"{(count/16) * 100}% 완료")


6.25% 완료
12.5% 완료
18.75% 완료
25.0% 완료
31.25% 완료
37.5% 완료
43.75% 완료
50.0% 완료
56.25% 완료
62.5% 완료
68.75% 완료
75.0% 완료
81.25% 완료
87.5% 완료
93.75% 완료
100.0% 완료


### 전년도 사고율 랭킹

In [46]:
plus_mean = total.drop(columns = ['opt threshold'], axis = 1).groupby("plus_name").mean().reset_index()

# charting
chart_list = []

# accuracy top count
chart_list.extend(plus_mean.sort_values(by='accuracy', ascending=False).head(1)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='accuracy', ascending=False).head(3)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='accuracy', ascending=False).head(5)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='accuracy', ascending=False).head(20)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='accuracy', ascending=False).head(30)['plus_name'].tolist())

# precision top count
chart_list.extend(plus_mean.sort_values(by='precision', ascending=False).head(1)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='precision', ascending=False).head(3)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='precision', ascending=False).head(5)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='precision', ascending=False).head(20)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='precision', ascending=False).head(30)['plus_name'].tolist())

# recall top count
chart_list.extend(plus_mean.sort_values(by='recall', ascending=False).head(1)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='recall', ascending=False).head(3)['plus_name'].tolist())
chart_list.extend(plus_mean.sort_values(by='recall', ascending=False).head(5)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='recall', ascending=False).head(20)['plus_name'].tolist())
# chart_list.extend(plus_mean.sort_values(by='recall', ascending=False).head(30)['plus_name'].tolist())

# count 및 rate 계산
count= pd.DataFrame(chart_list)
ranking = pd.DataFrame(count.value_counts()).reset_index()
ranking.columns = ['data', 'count']
ranking['rate'] = round(ranking['count']/9, 3)

ranking.head(5).sort_values(by=["count","data"], ascending = False)
# 전년도

Unnamed: 0,data,count,rate
2,2211,4,0.444
1,2112,4,0.444
0,1211,4,0.444
4,2222,3,0.333
3,2212,3,0.333


In [47]:
plus_mean.sort_values(by=["accuracy"], ascending = False).head(5)

Unnamed: 0,plus_name,accuracy,precision,recall
9,2112,0.542522,0.196754,0.557972
13,2212,0.541843,0.196637,0.55924
10,2121,0.539098,0.196235,0.563831
12,2211,0.538724,0.197263,0.568567
14,2221,0.535691,0.196223,0.568893


### 전년도 사고율 변수 결론

변수 있는 것과 없는 것 차이가 별로 없는 듯

In [48]:
# ranking 결과 데이터셋 이름 가져오기
plus_name = ranking.iloc[0, 0]

### 동점이기 때문에 accuracy 기준으로 더 높은 데이터를 사용 / 아니면 동점인 경우 무조건 포함할까 고민
# plus_name = str(2111)

# 데이터셋 생성
drop_list = []
name = ""

# drop할 column과 dataset 이름 설정
if (plus_name[0] == '1'):
    drop_list.append('연령대전년도사고율')
if (plus_name[1] == '1'):
    drop_list.append('성별전년도사고율')
if (plus_name[2] == '1'):
    drop_list.append('차종전년도사고율')
if (plus_name[3] == '1'):
    drop_list.append('특약전년도사고율')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,3.0,4.0,5.0,6.0,7.0,8.0,차량가입금액,차량경과numeric,마일리지numeric,성별전년도사고율
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,7.8,15,0.155036
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,3.16,0,0.155036
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,12.86,0,0.155036


### 예측 사고율 추가

In [49]:
predicted

Unnamed: 0,20대,30대,40대,50대,~64,65~,남성,여성,특약부부,특약 미가입,특약 기타,특약 기명피보험자1인,특약 가족,남성 TAAS,여성 TAAS
0,0.00551,0.003748,0.033873,0.004499,0.022487,0.010354,0.04021,0.025433,0.043751,0.055043,0.06511,0.045383,0.087615,0.008801,0.003425


In [50]:
# 예측 사고율
# 연령대
if (data_name[0] == '1'):
    yn_X['연령대예측사고율'] = np.where(yn_X['30.0'] == True, predicted['30대'],
                        np.where(yn_X['40.0'] == True, predicted['40대'],
                        np.where(yn_X['50.0'] == True, predicted['50대'],
                        np.where(yn_X['60.0'] == True, predicted['~64'],
                        np.where(yn_X['20.0'] == True, predicted['20대'],
                        np.where(yn_X['70.0'] == True, predicted['65~'],
                        np.where(yn_X['80.0'] == True, predicted['65~'],
                        np.where(yn_X['90.0'] == True, predicted['65~'], predicted['20대'])))))))) # 10대가 지워짐
elif(data_name[0] == '2'):
    yn_X['연령대예측사고율'] = np.where(yn_X['연령대'] == 30, predicted['30대'],
                            np.where(yn_X['연령대'] == 40, predicted['40대'],
                            np.where(yn_X['연령대'] == 50, predicted['50대'],
                            np.where(yn_X['연령대'] == 60, predicted['~64'],
                            np.where(yn_X['연령대'] <= 20, predicted['20대'],
                            np.where(yn_X['연령대'] >= 70, predicted['65~'], np.nan))))))

# 성별
yn_X['성별예측사고율'] = np.where(yn_X['여성'] == False, predicted['남성'], predicted['여성'])

# 특약
yn_X['특약예측사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), predicted['특약 기명피보험자1인'],
                            np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), predicted['특약부부'],
                            np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), predicted['특약 가족'],
                            np.where(yn_X['누구나(기본)'] == True, predicted['특약 미가입'],
                            np.where(yn_X['임직원한정'] == True, predicted['특약 기타'], predicted['특약부부']))))) # 부부및자녀한정이지워짐

In [51]:
yn_X

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,6.0,7.0,8.0,차량가입금액,차량경과numeric,마일리지numeric,성별전년도사고율,연령대예측사고율,성별예측사고율,특약예측사고율
0,1,0,0,0,0,1,0,0,0,0,...,0,0,1,1,7.80,15,0.155036,0.005510,0.025433,0.087615
1,1,0,0,0,0,0,0,1,0,0,...,0,0,1,1,3.16,0,0.155036,0.005510,0.025433,0.055043
2,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,12.86,0,0.155036,0.005510,0.025433,0.087615
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,3.16,15,0.155036,0.005510,0.025433,0.087615
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,3.16,15,0.155036,0.005510,0.025433,0.087615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210310,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,3.16,0,0.129267,0.033873,0.040210,0.045383
210311,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,12.86,7,0.129267,0.033873,0.040210,0.055043
210312,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,3.16,15,0.129267,0.033873,0.040210,0.043751
210313,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,3.16,0,0.129267,0.033873,0.040210,0.055043


### 예측 사고율 모델 실행

In [52]:
# 처음 시작할 때
total = pd.DataFrame()

# 이미 실행한 게 있을 경우
# total = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/LR{data_name}_lstm결과.csv", encoding="euc-kr")

count = 0
for a in range(1,3):
    for b in range(1,3):
        for c in range(1,3):

            # plus data 추가 여부 표기, 1이면 추가하지 않는 경우, 2면 추가하는 경우
            drop_list = []
            name = str(a) + str(b) + str(c)

            # drop할 column과 dataset 이름 설정
            if (a==1):
                drop_list.append('연령대예측사고율')
            if (b==1):
                drop_list.append('성별예측사고율')
            if (c==1):
                drop_list.append('특약예측사고율')

            # 데이터셋 X, y 설정
            if(len(drop_list) != 0):
                X = yn_X.drop(columns = drop_list, axis = 1)
            else:
                X = yn_X

            y = yn_y

            # 모델 실행
            logistic_rst = logistic_classifier(X, y)

            # 결과 저장 & csv 저장
            logistic_rst['lstm_name'] = name
            total = pd.concat([total, logistic_rst])

            total.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR{data_name}_lstm결과.csv", index = False, encoding="euc-kr")

            # 진행상황 출력
            count += 1
            print(f"{(count/8) * 100}% 완료")


12.5% 완료
25.0% 완료
37.5% 완료
50.0% 완료
62.5% 완료
75.0% 완료
87.5% 완료
100.0% 완료


### 예측 사고율 랭킹

In [53]:
lstm_mean = total.drop(columns = ['opt threshold'], axis = 1).groupby("lstm_name").mean().reset_index()

# charting
chart_list = []

# accuracy top count
chart_list.extend(lstm_mean.sort_values(by='accuracy', ascending=False).head(1)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='accuracy', ascending=False).head(3)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='accuracy', ascending=False).head(5)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='accuracy', ascending=False).head(20)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='accuracy', ascending=False).head(30)['lstm_name'].tolist())

# precision top count
chart_list.extend(lstm_mean.sort_values(by='precision', ascending=False).head(1)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='precision', ascending=False).head(3)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='precision', ascending=False).head(5)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='precision', ascending=False).head(20)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='precision', ascending=False).head(30)['lstm_name'].tolist())

# recall top count
chart_list.extend(lstm_mean.sort_values(by='recall', ascending=False).head(1)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='recall', ascending=False).head(3)['lstm_name'].tolist())
chart_list.extend(lstm_mean.sort_values(by='recall', ascending=False).head(5)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='recall', ascending=False).head(20)['lstm_name'].tolist())
# chart_list.extend(lstm_mean.sort_values(by='recall', ascending=False).head(30)['lstm_name'].tolist())

# count 및 rate 계산
count= pd.DataFrame(chart_list)
ranking = pd.DataFrame(count.value_counts()).reset_index()
ranking.columns = ['data', 'count']
ranking['rate'] = round(ranking['count']/9, 3)

ranking.head(5).sort_values(by=["count","data"], ascending = False)

Unnamed: 0,data,count,rate
0,211,6,0.667
3,212,4,0.444
2,122,4,0.444
1,111,4,0.444
4,112,3,0.333


In [54]:
lstm_mean.sort_values(by=["accuracy"], ascending = False).head(5)

Unnamed: 0,lstm_name,accuracy,precision,recall
4,211,0.542522,0.198125,0.565586
3,122,0.540402,0.197303,0.565989
6,221,0.535366,0.196715,0.57278
0,111,0.535338,0.197617,0.576677
5,212,0.531618,0.19706,0.581509


### 예측 사고율 변수 결론

In [55]:
# ranking 결과 데이터셋 이름 가져오기
lstm_name = ranking.iloc[0, 0]

### 역대급으로 비슷하다
# lstm_name = str(121)

# 데이터셋 생성
drop_list = []
name = ""

# drop할 column과 dataset 이름 설정
if (lstm_name[0] == '1'):
    drop_list.append('연령대예측사고율')
if (lstm_name[1] == '1'):
    drop_list.append('성별예측사고율')
if (lstm_name[2] == '1'):
    drop_list.append('특약예측사고율')


# X, y 설정
if(len(drop_list) != 0):
    yn_X = yn_X.drop(columns = drop_list, axis = 1)

yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,4.0,5.0,6.0,7.0,8.0,차량가입금액,차량경과numeric,마일리지numeric,성별전년도사고율,연령대예측사고율
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,1,7.8,15,0.155036,0.00551
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,3.16,0,0.155036,0.00551
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,12.86,0,0.155036,0.00551


In [56]:
# 파생변수 최종 데이터셋 저장
yn_X['사고유무'] = base['사고유무']
yn_X.to_csv(f"/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR_파생변수_dataset.csv", index = False, encoding="euc-kr")