### 패키지 불러오기

In [None]:
# 연산 처리  패키지
import pandas as pd
import numpy as np
import math

# 전처리 패키지
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

# 모델 패키지
from sklearn.linear_model import LogisticRegression, LinearRegression

# 평가지표 패키지
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

### 데이터 불러오기

In [None]:
# Mount Google Drive in the Colab runtime so the dataset path below resolves
from google.colab import drive

drive.mount('/content/drive')

# Read the dataset that has already gone through the fixed preprocessing step
df = pd.read_csv(
    "/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv",
    encoding="euc-kr",
    engine='python',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Preview the first rows of the loaded frame to check the column layout
df.head()

Unnamed: 0,연령대,성별,국산차량여부,직전3년간사고건수,차량경과년수,차종,운전자한정특별약관,가입경력코드,차량가입금액,영상기록장치특약가입,마일리지약정거리,사고유무
0,10.0,여성,국산,C,10년이하,중형,가족한정(형제자매제외),8.0,5천만원이하,미가입,15000K,1
1,10.0,여성,국산,D,5년이하,다목적2종,누구나(기본),8.0,5천만원이하,가입,미가입,1
2,10.0,여성,국산,D,10년이상,중형,가족한정(형제자매제외),8.0,미가입,미가입,미가입,0
3,10.0,여성,국산,N,5년이하,소형B,가족한정(형제자매제외),2.0,5천만원이하,가입,15000K,0
4,10.0,여성,국산,N,5년이하,소형B,가족한정(형제자매제외),3.0,5천만원이하,가입,15000K,0


### 라벨링 코드

> **컬럼네임 해석**

각 자릿수는 age, NCR, carAge, carType, mileage, exp, money 순서대로 해당 변수의 전처리 방식을 나타냄

1: one-hot

2: handled label

ex) 1111112 는 age부터 exp까지 여섯 변수는 전부 one-hot이고, money만 handled label로 전처리


In [None]:
def make_df(df):
    """Build both candidate encodings for each of the seven tunable features.

    For age, NCR (accidents in the last three years), car age, car type,
    mileage, subscription experience and insured amount, a two-element list
    ``[one_hot, handled_label]`` is produced:

    * ``one_hot``       -- ``pd.get_dummies(..., drop_first=True)`` frame
    * ``handled_label`` -- integer Series built from the hand-assigned codes
      listed in this function

    Every returned object is re-indexed from 0 so the pieces can be
    concatenated column-wise by the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw categorical columns read below and the target
        column ``'사고유무'``.

    Returns
    -------
    tuple
        ``(ages, NCRs, carAges, carTypes, mileages, exps, moneys,
        combined_df, df_y)`` where the first seven items are the
        ``[one_hot, handled_label]`` lists, ``combined_df`` holds the
        always-one-hot columns (sex, domestic flag, driver clause, dashcam
        rider) and ``df_y`` is the target Series.

    Raises
    ------
    ValueError
        Raised by ``int()`` when a value is not turned into an integer
        literal by the substitutions (e.g. an unexpected category).
    """

    def one_hot(column, **dummy_kwargs):
        # Dummy-encode one column (first level dropped) on a fresh 0..n-1 index
        dummies = pd.get_dummies(df[column], drop_first=True, **dummy_kwargs)
        return dummies.reset_index(drop=True)

    def by_substitution(column, substitutions):
        # Apply the (old, new) substring replacements in the given order and
        # cast the result to int. The order is significant: for example
        # '15000K' has to be rewritten before '5000K' can be touched.
        def encode(raw):
            text = str(raw)
            for old, new in substitutions:
                text = text.replace(old, new)
            return int(text)

        return df[column].apply(encode).reset_index(drop=True)

    ### Age band ###
    # handled label: drop thousands separators and keep the integer part
    ages = [
        one_hot('연령대', prefix='연령대'),
        df['연령대']
        .apply(lambda raw: int(str(raw).replace(',', '').split('.')[0]))
        .reset_index(drop=True),
    ]

    ### Accidents in the last three years (NCR) ###
    # handled label (per the original note): new 0, no accident 1, one 2,
    # two 3, three 4; a missing value stored as 0 is folded into "no accident"
    NCRs = [
        one_hot('직전3년간사고건수', prefix='NCR'),
        by_substitution(
            '직전3년간사고건수',
            [('0', 'N'), ('Z', '0'), ('N', '1'), ('D', '2'), ('C', '3'), ('B', '4')],
        ),
    ]

    ### Car age ###
    carAges = [
        one_hot('차량경과년수', prefix='차량경과년수'),
        by_substitution(
            '차량경과년수',
            [('신차', '0'), ('5년이하', '1'), ('10년이하', '2'), ('10년이상', '3')],
        ),
    ]

    ### Car type ###
    carTypes = [
        one_hot('차종', prefix='차종'),
        by_substitution(
            '차종',
            [
                ('소형A', '0'),
                ('소형B', '1'),
                ('중형', '2'),
                ('대형', '3'),
                ('다목적1종', '4'),
                ('다목적2종', '5'),
                ('기타', '6'),
            ],
        ),
    ]

    ### Mileage agreement ###
    # The 12000K / 15000K / not-subscribed codes are replaced first so that
    # the shorter patterns that follow cannot match inside them.
    mileages = [
        one_hot('마일리지약정거리', prefix='마일리지'),
        by_substitution(
            '마일리지약정거리',
            [
                ('12000K', '4'),
                ('15000K', '5'),
                ('미가입', '6'),
                ('3000K', '0'),
                ('5000K', '1'),
                ('7000K', '2'),
                ('10000K', '3'),
            ],
        ),
    ]

    ### Subscription experience ###
    # one-hot columns keep the raw level values as names (no prefix)
    exps = [
        one_hot('가입경력코드'),
        df['가입경력코드']
        .apply(lambda raw: int(str(raw).split('.')[0]))
        .reset_index(drop=True),
    ]

    ### Insured amount ###
    # one-hot columns keep the raw level values as names (no prefix)
    moneys = [
        one_hot('차량가입금액'),
        by_substitution(
            '차량가입금액',
            [('미가입', '0'), ('5천만원이하', '1'), ('1억이하', '2'), ('1억이상', '3')],
        ),
    ]

    ### Fixed preprocessing (always one-hot) ###
    # The driver clause has no meaningful order, hence dummies only.
    fixed_columns = [
        ('성별', '성별'),
        ('국산차량여부', '국산'),
        ('운전자한정특별약관', '약관'),
        ('영상기록장치특약가입', '영상기록'),
    ]
    fixed_dummies = [
        pd.get_dummies(df[column], prefix=prefix, drop_first=True)
        for column, prefix in fixed_columns
    ]
    combined_df = pd.concat(fixed_dummies, axis=1).reset_index(drop=True)

    # Target
    df_y = df['사고유무'].reset_index(drop=True)

    return ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y


### 모델 실행

In [None]:
# Logistic regression helper
def logistic_classifier(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and score it at the ROC-optimal threshold.

    The model is trained on ``(X_train, y_train)``. The decision threshold is
    then chosen on the test ROC curve as the point maximising ``tpr - fpr``,
    and accuracy / precision / recall are computed for the test predictions
    made at that threshold.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels passed to ``LogisticRegression.fit``.
    X_test, y_test
        Held-out features and binary labels used for the ROC curve and metrics.

    Returns
    -------
    list
        ``[accuracy, precision, recall]`` at the selected threshold.
    """
    # Default-configured model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Probability of the second class (column 1), kept 1-D for the metric calls
    y_proba = model.predict_proba(X_test)[:, 1]

    # fpr / tpr for every candidate threshold; keep the one with the largest gap
    fper, tper, thresholds = roc_curve(y_test, y_proba)
    optimal_idx = np.argmax(tper - fper)
    optimal_threshold = thresholds[optimal_idx]

    # roc_curve evaluates each point with ``score >= threshold``; the comparison
    # must be inclusive so the metrics describe the same operating point
    # (a strict ``>`` would drop the samples scoring exactly at the threshold).
    y_optpred = (y_proba >= optimal_threshold).astype(int)

    accuracy = accuracy_score(y_test, y_optpred)
    precision = precision_score(y_test, y_optpred)
    recall = recall_score(y_test, y_optpred)

    return [accuracy, precision, recall]


In [None]:
from itertools import product

# Per-run records: preprocessing code and the three metrics
data = []
acc = []
prec = []
rec = []

n_repeats = 30

for i in range(n_repeats):

    # Draw a fresh random sample of 8000 rows for this repetition
    now_df = df.sample(n=8000)

    # Build both encodings (one-hot / handled label) of every tunable feature
    ages, NCRs, carAges, carTypes, mileages, exps, moneys, combined_df, df_y = make_df(now_df)

    # Every way of picking one encoding per feature. Each pick is a
    # (code, frame) pair where code 1 = one-hot and 2 = handled label,
    # in the order age, NCR, carAge, carType, mileage, exp, money.
    feature_options = (ages, NCRs, carAges, carTypes, mileages, exps, moneys)
    combos = list(product(*(enumerate(options, start=1) for options in feature_options)))
    n_combos = len(combos)

    for count, combo in enumerate(combos, start=1):

        # Name of this preprocessing combination, e.g. "1122212"
        name = ''.join(str(code) for code, _ in combo)

        # Fixed columns followed by the chosen encoding of each feature
        comb_df = pd.concat([combined_df] + [encoded for _, encoded in combo], axis=1)

        # SMOTE needs string column names
        comb_df.columns = comb_df.columns.astype(str)

        # SMOTE needs numeric data: cast boolean dummy columns to int
        bool_mask = comb_df.dtypes == np.bool_
        bool_cols = comb_df.columns[bool_mask].tolist()
        comb_df = comb_df.astype({col_name: int for col_name in bool_cols})

        # train / test split
        X_train, X_test, y_train, y_test = train_test_split(comb_df, df_y, test_size=0.2)

        # Oversample the training part only, so the test set stays untouched
        smote = SMOTE(sampling_strategy='auto')
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Metrics at the ROC-optimal threshold
        accuracy, precision, recall = logistic_classifier(X_train_resampled, X_test, y_train_resampled, y_test)

        data.append(name)
        acc.append(accuracy)
        prec.append(precision)
        rec.append(recall)

        # Report progress within this repetition every 16 combinations,
        # as an actual percentage of the combinations processed so far
        if count % 16 == 0:
            print(f"{100 * count / n_combos}% 완료")

    # Overall progress: completed repetitions as a percentage
    print("전체 ", 100 * (i + 1) / n_repeats, "% 완료")

# One row per run, then the mean of each metric per preprocessing combination
results = pd.DataFrame({
    'data': data,
    'accuracy': acc,
    'precision': prec,
    'recall': rec,
})
df_result = results.groupby('data').mean().reset_index()


# Persist the raw runs and the per-combination summary
results.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR전처리_결과.csv", index=False)
df_result.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR_결과/LR전처리_요약.csv", index=False)


0.0078125% 완료
0.015625% 완료
0.0234375% 완료
0.03125% 완료
0.0390625% 완료
0.046875% 완료
0.0546875% 완료
0.0625% 완료
0.0703125% 완료
0.078125% 완료
0.0859375% 완료
0.09375% 완료
0.1015625% 완료
0.109375% 완료
0.1171875% 완료
0.125% 완료
0.1328125% 완료
0.140625% 완료
0.1484375% 완료
0.15625% 완료
0.1640625% 완료
0.171875% 완료
0.1796875% 완료
0.1875% 완료
0.1953125% 완료
0.203125% 완료
0.2109375% 완료
0.21875% 완료
0.2265625% 완료
0.234375% 완료
0.2421875% 완료
0.25% 완료
0.2578125% 완료
0.265625% 완료
0.2734375% 완료
0.28125% 완료
0.2890625% 완료
0.296875% 완료
0.3046875% 완료
0.3125% 완료
0.3203125% 완료
0.328125% 완료
0.3359375% 완료
0.34375% 완료
0.3515625% 완료
0.359375% 완료
0.3671875% 완료
0.375% 완료
0.3828125% 완료
0.390625% 완료
0.3984375% 완료
0.40625% 완료
0.4140625% 완료
0.421875% 완료
0.4296875% 완료
0.4375% 완료
0.4453125% 완료
0.453125% 완료
0.4609375% 완료
0.46875% 완료
0.4765625% 완료
0.484375% 완료
0.4921875% 완료
0.5% 완료
0.5078125% 완료
0.515625% 완료
0.5234375% 완료
0.53125% 완료
0.5390625% 완료
0.546875% 완료
0.5546875% 완료
0.5625% 완료
0.5703125% 완료
0.578125% 완료
0.5859375% 완료
0.59375% 완료
0.6015625

### 랭킹

In [None]:
# Five preprocessing combinations with the highest accuracy, best first
df_result.sort_values("accuracy", ascending=False).iloc[:5]

Unnamed: 0,data,accuracy,precision,recall
29,1122212,0.591042,0.20808,0.493731
79,2112222,0.590104,0.200465,0.483793
95,2122222,0.584063,0.202336,0.472579
31,1122222,0.560479,0.195154,0.513375
23,1121222,0.559208,0.19778,0.532767
28,1122211,0.556187,0.212207,0.587579
91,2122122,0.549271,0.19326,0.516064
87,2121222,0.548708,0.20156,0.550904
37,1211212,0.546479,0.207811,0.581039
77,2112212,0.545167,0.198266,0.556626


In [None]:
# Five preprocessing combinations with the highest recall, best first
df_result.sort_values("recall", ascending=False).iloc[:5]

Unnamed: 0,data,accuracy,precision,recall
104,2212111,0.450021,0.194599,0.718961
59,1222122,0.413458,0.1845,0.711446
64,2111111,0.468542,0.198411,0.708343
32,1211111,0.469792,0.199011,0.707413
72,2112111,0.464125,0.197746,0.703501
40,1212111,0.470458,0.198182,0.703479
48,1221111,0.481667,0.20383,0.702066
98,2211121,0.450208,0.190774,0.700646
42,1212121,0.455458,0.190747,0.698599
115,2221122,0.430583,0.184085,0.696036


In [None]:
# Five preprocessing combinations with the highest precision, best first
df_result.sort_values("precision", ascending=False).iloc[:5]

Unnamed: 0,data,accuracy,precision,recall
28,1122211,0.556187,0.212207,0.587579
29,1122212,0.591042,0.20808,0.493731
37,1211212,0.546479,0.207811,0.581039
76,2112211,0.515479,0.206431,0.630886
69,2111212,0.538167,0.205346,0.583984
4,1111211,0.518979,0.205067,0.649867
86,2121221,0.536708,0.204752,0.602843
20,1121211,0.534125,0.204198,0.614941
65,2111112,0.516021,0.204036,0.614193
92,2122211,0.499063,0.203945,0.658084


In [None]:
# Vote list: a combination gets one vote for every top-k cut it appears in,
# so higher-ranked combinations collect more votes per metric
chart_list = []

for metric in ('accuracy', 'precision', 'recall'):
    # Sort once per metric; each top-k cut is a prefix of this ordering
    ordered_names = df_result.sort_values(by=metric, ascending=False)['data']
    for top_k in (5, 10, 15, 20, 30):
        chart_list.extend(ordered_names.head(top_k).tolist())

# Tally the votes; rate = votes divided by the 15 lists (3 metrics x 5 cuts)
count = pd.DataFrame(chart_list)
ranking = pd.DataFrame(count.value_counts()).reset_index()
ranking.columns = ['data', 'count']
ranking['rate'] = (ranking['count'] / 15).round(3)

ranking.head(5)

Unnamed: 0,data,count,rate
0,1122212,10,0.667
1,1211212,9,0.6
2,1122211,9,0.6
3,2111212,8,0.533
4,2121221,7,0.467




---



최고 전처리셋 : 1122212