### 패키지 설치

In [1]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from sklearn.linear_model import LogisticRegression, LinearRegression

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

### 데이터 불러오기

In [2]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

# 데이터 읽어오기 (고정 전처리한 데이터)
df = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", engine='python')

Mounted at /content/drive


In [3]:
df

Unnamed: 0,연령대,성별,국산차량여부,직전3년간사고건수,차량경과년수,차종,운전자한정특별약관,가입경력코드,차량가입금액,영상기록장치특약가입,마일리지약정거리,사고유무
0,10.0,1.0,1.0,C,10년이하,중형,가족한정(형제자매제외),8.0,5천만원이하,미가입,15000K,1
1,10.0,1.0,1.0,D,5년이하,다목적2종,누구나(기본),8.0,5천만원이하,가입,미가입,1
2,10.0,1.0,1.0,D,10년이상,중형,가족한정(형제자매제외),8.0,미가입,미가입,미가입,0
3,10.0,1.0,1.0,N,5년이하,소형B,가족한정(형제자매제외),2.0,5천만원이하,가입,15000K,0
4,10.0,1.0,1.0,N,5년이하,소형B,가족한정(형제자매제외),3.0,5천만원이하,가입,15000K,0
...,...,...,...,...,...,...,...,...,...,...,...,...
187978,40.0,2.0,1.0,C,5년이하,소형B,부부한정,7.0,미가입,가입,15000K,0
187979,40.0,2.0,1.0,C,10년이상,다목적2종,누구나(기본),5.0,5천만원이하,미가입,15000K,0
187980,40.0,2.0,1.0,C,5년이하,대형,가족한정(형제자매제외),7.0,5천만원이하,가입,미가입,1
187981,40.0,2.0,1.0,C,10년이상,중형,누구나(기본),8.0,미가입,미가입,7000K,0


### 라벨링 코드

> **컬럼네임 해석**

age, NCR, carAge, carType, mileage,exp, money 순서대로

1: one-hot

2: handled label

ex) 111112 은 전부 one-hot이고, money만 handled label로 전처리


In [4]:
def make_df(df):

    ### 연령대 ###
    # one-hot
    one_hot_age = pd.get_dummies(df['연령대'], prefix='연령대', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_age = df['연령대'].apply(lambda x: int(str(x).replace(',', '').split('.')[0])).reset_index(drop=True)

    ages = [one_hot_age, labeled_age]


    ### 직전 3년간 사고 건수 ###
    # one-hot
    one_hot_NCR = pd.get_dummies(df['직전3년간사고건수'], prefix='NCR', drop_first = True).reset_index(drop=True)

    # handled-label
    # 신규 0, 무사고 1, 1회 2, 2회 3, 3회 4, 결측치 0 은 무사고로 편입
    labeled_NCR_tmp = df['직전3년간사고건수'].apply(lambda x: str(x).replace('0', 'N'))
    labeled_NCR = labeled_NCR_tmp.apply(lambda x: int(str(x).replace('Z', '0').replace('N', '1').replace('D', '2').replace('C', '3').replace('B', '4'))).reset_index(drop=True)

    NCRs = [one_hot_NCR, labeled_NCR]


    ### 차량 경과년수 ###
    # one-hot
    one_hot_carAge = pd.get_dummies(df['차량경과년수'], prefix='차량경과년수', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_carAge = df['차량경과년수'].apply(lambda x: int(str(x).replace('신차', '0').replace('5년이하', '1').replace('10년이하', '2').replace('10년이상', '3'))).reset_index(drop=True)

    carAges = [one_hot_carAge, labeled_carAge]


    ### 차종 ###
    # one-hot
    one_hot_carType = pd.get_dummies(df['차종'], prefix='차종', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_carType = df['차종'].apply(lambda x: int(str(x).replace('소형A', '0').replace('소형B', '1').replace('중형', '2').replace('대형', '3').replace('다목적1종','4').replace('다목적2종','5').replace('기타','6'))).reset_index(drop=True)

    carTypes = [one_hot_carType, labeled_carType]


    ### 마일리지 ###
    # one-hot
    one_hot_mileage = pd.get_dummies(df['마일리지약정거리'], prefix='마일리지', drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_mileage_1 = df['마일리지약정거리'].apply(lambda x: str(x).replace('12000K','4').replace('15000K','5').replace('미가입','6'))
    labeled_mileage = labeled_mileage_1.apply(lambda x: int(str(x).replace('3000K', '0').replace('5000K', '1').replace('7000K', '2').replace('10000K', '3'))).reset_index(drop=True)

    mileages = [one_hot_mileage, labeled_mileage]


    ### 가입경력 ###
    # one-hot
    one_hot_exp = pd.get_dummies(df['가입경력코드'], drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_exp = df['가입경력코드'].apply(lambda x: int(str(x).split('.')[0])).reset_index(drop=True)

    exps = [one_hot_exp, labeled_exp]


    ### 가입금액 ###
    # one-hot
    one_hot_money = pd.get_dummies(df['차량가입금액'], drop_first = True).reset_index(drop=True)

    # handled-label
    labeled_money = df['차량가입금액'].apply(lambda x : int(str(x).replace('미가입','0').replace('5천만원이하','1').replace('1억이하','2').replace('1억이상','3'))).reset_index(drop=True)

    moneys = [one_hot_money, labeled_money]


    ### 고정 전처리들 ###
    one_hot_sex = pd.get_dummies(df['성별'], prefix='성별', drop_first = True)
    one_hot_kor = pd.get_dummies(df['국산차량여부'], prefix='국산', drop_first = True)
    one_hot_rule = pd.get_dummies(df['운전자한정특별약관'], prefix='약관', drop_first = True) # 순서에 의미가 없어 dummy화
    one_hot_cctv = pd.get_dummies(df['영상기록장치특약가입'], prefix='영상기록', drop_first = True)

    combined_df = pd.concat([one_hot_sex, one_hot_kor, one_hot_rule, one_hot_cctv], axis=1).reset_index(drop=True)

    # y
    df_y = df['사고유무'].reset_index(drop=True)

    return ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y


### 모델 실행

In [5]:
# 로지스틱 회귀 함수
def logistic_classifier(X_train, X_test, y_train, y_test):

    # LogisticRegression 초기화
    model = LogisticRegression(random_state = 97, n_jobs = -1)

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1].reshape(-1,1) # 양성 클래스에 대한 확률 추출

    # 기본 평가지표
    auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred, average='weighted')

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn+fp)

    # 최적 threshold 값 & 그 때의 FPR 출력
    fper, tper, thresholds = roc_curve(y_test, y_proba)  # thresholds 별 fpr tpr 계산
    optimal_idx = np.argmax(tper - fper)                 # fpr, tpr 간 차이가 가장 클 때의 index 저장
    optimal_fpr = fper[optimal_idx]                      # 해당 index에 위치한 fpr값 저장
    optimal_threshold = thresholds[optimal_idx]

    # 최적 threshold 평가지표
    y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

    opt_accuracy = accuracy_score(y_test, y_optpred)
    opt_precision = precision_score(y_test, y_optpred)
    opt_recall = recall_score(y_test, y_optpred)
    opt_f1score = f1_score(y_test, y_optpred, average='weighted')

    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_optpred).ravel()
    opt_specificity = tn2 / (tn2 + fp2)

    return auc, accuracy, recall, precision, f1score, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1score, opt_specificity


In [6]:
data= []
auc = []
acc = []
prec = []
rec = []
f1 = []
spec = []

optacc = []
optprec = []
optrec = []
optspec = []
optf1 = []

for i in range(30):

    # 8천개를 sampling
    now_df = df.sample(n=8000)

    # 모든 라벨링 경우의 수 생성
    ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y = make_df(now_df)

    count = 0
    for a, age in enumerate(ages, start=1):
        for b, NCR in enumerate(NCRs, start=1):
            for c, carAge in enumerate(carAges, start=1):
                for d, carType in enumerate(carTypes, start=1):
                    for e, mileage in enumerate(mileages, start=1):
                        for f, exp in enumerate(exps, start=1):
                            for g, money in enumerate(moneys, start=1):

                                # 전처리 경우의 수 이름
                                name = str(a) + str(b) + str(c) + str(d) + str(e) + str(f) + str(g)

                                # 전처리 경우의 수를 합친 데이터프레임 생성
                                comb_df = pd.concat([combined_df, age, NCR, carAge, carType, mileage,exp, money], axis=1)

                                # smote 전 column명 string으로 변환
                                comb_df.columns = comb_df.columns.astype(str)

                                # smote 전 data type 변환
                                bool_mask = comb_df.dtypes == np.bool_
                                bool_cols = comb_df.columns[bool_mask].tolist()

                                for col_name in bool_cols:
                                    comb_df[col_name] = comb_df[col_name].astype(int)

                                # train test split
                                X_train, X_test, y_train, y_test = train_test_split(comb_df, df_y, test_size = 0.2)

                                # SMOTE를 이용하여 증강
                                smote = SMOTE(sampling_strategy='auto')
                                X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

                                # 평가지표
                                aucc, accuracy, recall, precision, f1score, specificity, opt_accuracy, opt_recall, opt_precision, opt_f1score, opt_specificity = logistic_classifier(X_train_resampled, X_test, y_train_resampled, y_test)

                                data.append(name)
                                auc.append(aucc)
                                acc.append(accuracy)
                                prec.append(precision)
                                rec.append(recall)
                                f1.append(f1score)
                                spec.append(specificity)

                                optacc.append(opt_accuracy)
                                optprec.append(opt_precision)
                                optrec.append(opt_recall)
                                optf1.append(opt_f1score)
                                optspec.append(opt_specificity)

                                # count += 1
                                # if((count+1)%16 == 0) :
                                #     print(f"{count/128 * 100}% 완료")

    print("전체 ", i/30 * 100, "% 완료")

# 결과 DataFrame 생성
results = pd.DataFrame({
    'data' : data,
    'auc' : auc,
    'accuracy' : acc,
    'precision' : prec,
    'recall' : rec,
    "f1-score" : f1,
    "specificity" : spec,
    'opt_accuracy' : optacc,
    'opt_precision' : optprec,
    'opt_recall' : optrec,
    "opt_f1-score" : optf1,
    "opt_specificity" : optspec
})

# DataFrame 저장
results.sort_values(by=["data"]).to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR최종/결과/LR_전처리.csv", index = False)

전체  0.0 % 완료
전체  3.3333333333333335 % 완료
전체  6.666666666666667 % 완료
전체  10.0 % 완료
전체  13.333333333333334 % 완료
전체  16.666666666666664 % 완료
전체  20.0 % 완료
전체  23.333333333333332 % 완료
전체  26.666666666666668 % 완료
전체  30.0 % 완료
전체  33.33333333333333 % 완료
전체  36.666666666666664 % 완료
전체  40.0 % 완료
전체  43.333333333333336 % 완료
전체  46.666666666666664 % 완료
전체  50.0 % 완료
전체  53.333333333333336 % 완료
전체  56.666666666666664 % 완료
전체  60.0 % 완료
전체  63.33333333333333 % 완료
전체  66.66666666666666 % 완료
전체  70.0 % 완료
전체  73.33333333333333 % 완료
전체  76.66666666666667 % 완료
전체  80.0 % 완료
전체  83.33333333333334 % 완료
전체  86.66666666666667 % 완료
전체  90.0 % 완료
전체  93.33333333333333 % 완료
전체  96.66666666666667 % 완료


### 랭킹

In [7]:
df_result = results.groupby('data').mean().reset_index()

In [8]:
# accuracy 기준 결과 내림차순
df_result.sort_values(by=["auc"], ascending = False).head(5)

## 전처리 데이터 : 1121211

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
84,2121211,0.57489,0.797417,0.186678,0.138465,0.784594,0.902917,0.542958,0.173494,0.602085,0.603966,0.533176
92,2122211,0.569554,0.775813,0.183835,0.18048,0.77484,0.871507,0.529354,0.170629,0.60992,0.590222,0.515941
28,1122211,0.568999,0.787396,0.180809,0.155846,0.780569,0.887861,0.5405,0.169755,0.593802,0.604972,0.532125
80,2121111,0.568681,0.811792,0.177667,0.104492,0.790178,0.923669,0.503875,0.167217,0.646169,0.567265,0.481713
56,1222111,0.567976,0.797187,0.162743,0.118289,0.783256,0.904165,0.500896,0.165316,0.653078,0.566712,0.477346


In [9]:
# accuracy 기준 결과 내림차순
df_result.sort_values(by=["accuracy"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
55,1221222,0.533606,0.815979,0.152227,0.077128,0.789184,0.932178,0.508333,0.157325,0.579455,0.560523,0.49787
64,2111111,0.56563,0.815646,0.190449,0.103459,0.79172,0.929432,0.51475,0.165824,0.619606,0.578401,0.497382
16,1121111,0.564423,0.815438,0.170359,0.089157,0.789996,0.930764,0.497083,0.16522,0.647419,0.561364,0.472935
0,1111111,0.565091,0.815021,0.175707,0.09086,0.78898,0.931425,0.489833,0.165887,0.655203,0.551325,0.463028
32,1211111,0.564265,0.814458,0.163179,0.082349,0.787465,0.932065,0.4905,0.166229,0.661238,0.553361,0.462772


In [10]:
# recall 기준 결과 내림차순
df_result.sort_values(by=["recall"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
93,2122212,0.546998,0.756729,0.172983,0.202911,0.764865,0.844968,0.561188,0.16823,0.53161,0.617908,0.565542
79,2112222,0.545115,0.761437,0.176648,0.191506,0.765369,0.854453,0.539146,0.166718,0.552165,0.593424,0.536023
95,2122222,0.543284,0.770479,0.177792,0.188,0.773143,0.862552,0.603042,0.168396,0.468708,0.651536,0.623116
77,2112212,0.554578,0.766687,0.173993,0.181832,0.768694,0.861043,0.551208,0.170557,0.558508,0.605811,0.550119
92,2122211,0.569554,0.775813,0.183835,0.18048,0.77484,0.871507,0.529354,0.170629,0.60992,0.590222,0.515941


In [11]:
# precision 기준 결과 내림차순
df_result.sort_values(by=["precision"], ascending = False).head(5)

Unnamed: 0,data,auc,accuracy,precision,recall,f1-score,specificity,opt_accuracy,opt_precision,opt_recall,opt_f1-score,opt_specificity
64,2111111,0.56563,0.815646,0.190449,0.103459,0.79172,0.929432,0.51475,0.165824,0.619606,0.578401,0.497382
12,1112211,0.567754,0.793688,0.187944,0.145076,0.78229,0.898567,0.559667,0.175453,0.564981,0.615398,0.557725
68,2111211,0.567144,0.805292,0.187474,0.123225,0.787946,0.914193,0.511542,0.169024,0.627316,0.568663,0.493114
84,2121211,0.57489,0.797417,0.186678,0.138465,0.784594,0.902917,0.542958,0.173494,0.602085,0.603966,0.533176
4,1111211,0.566772,0.812562,0.185992,0.107004,0.790585,0.924921,0.521646,0.16846,0.613883,0.583446,0.507669




---



최고 전처리셋

1트 : 1121211

2트 : 1111211

3트 : 2121211

4트 : 2111211

5트 : 2121211

=> 경향성 + 횟수 = 2111211
