## 라이브러리 호출

In [1]:
# 연산 처리  패키지
import pandas as pd
import numpy as np

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 통계분석 패키지
import statsmodels.api as sm

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

## Data 불러오기

In [2]:
# Mount Google Drive inside the Colab runtime so the dataset files are reachable.
from google.colab import drive

drive.mount('/content/drive')

# Read the three CSV parts; all of them use the same encoding and parser engine.
_read_options = dict(encoding="cp949", engine='python')
df1 = pd.read_csv("/content/drive/MyDrive/Dataset/(자동차보험) 고객별 사고 발생률 예측 모델링_1.csv", **_read_options)
df2 = pd.read_csv("/content/drive/MyDrive/Dataset/(자동차보험) 고객별 사고 발생률 예측 모델링_2.csv", **_read_options)
df3 = pd.read_csv("/content/drive/MyDrive/Dataset/(자동차보험) 고객별 사고 발생률 예측 모델링_3.csv", **_read_options)

Mounted at /content/drive


In [11]:
# df2 loaded cleanly, so its column set is used as the reference schema.
col = df2.columns

# Keep only the reference columns in df1 (drops the columns created by the
# loading error).
df1 = df1.loc[:, col]

# Drop the rows of df1 whose "차종" value is missing in the file.
null_idx = df1.index[df1["차종"].isnull()].tolist()
df1 = df1.drop(index=null_idx)

# Stack the three parts into one frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3], ignore_index=True)
df

Unnamed: 0,ZCPRLCLCD,ZINSRDAVL,ZIOSEXCD,ZDPRODSCD,NCR,ZCARPSGVL,차종,ZDRVLISCD,ZDRVLISCD___T,ZENTCARCD,ZCARISDAM,ZIMAGERVL,마일리지약정거리,YUHO,SAGO
0,A10,0.0,1.0,,B,신차,기타,5.0,가족및형제자매한정,8.0,5천만원이하,가입,15000K,1,0.0
1,A10,0.0,1.0,,N,신차,기타,4.0,기명피보험자1인한정,8.0,미가입,미가입,15000K,1,0.0
2,A10,0.0,1.0,,N,신차,기타,2.0,가족한정(형제자매제외),5.0,5천만원이하,가입,미가입,1,3.0
3,A10,0.0,1.0,1.0,N,10년이상,중형,2.0,가족한정(형제자매제외),2.0,5천만원이하,미가입,15000K,0,0.0
4,A10,0.0,1.0,1.0,Z,5년이하,다목적2종,2.0,가족한정(형제자매제외),1.0,미가입,미가입,미가입,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267772,A10,40.0,2.0,1.0,C,10년이상,중형,1.0,누구나(기본),8.0,미가입,미가입,7000K,0,0
267773,A10,40.0,2.0,1.0,C,5년이하,소형A,12.0,부부 및 자녀한정,8.0,5천만원이하,가입,15000K,2,0
267774,A10,40.0,2.0,1.0,C,10년이하,소형A,4.0,기명피보험자1인한정,8.0,미가입,미가입,미가입,6,1
267775,A10,40.0,2.0,1.0,C,5년이하,소형B,1.0,누구나(기본),7.0,5천만원이하,가입,미가입,0,0


In [12]:
# Convert the numeric columns that were loaded as object dtype.
# YUHO: strip thousands separators, then cast to int.
df['YUHO'] = df['YUHO'].str.replace(',', '').astype(int)
# SAGO: values may be numbers or strings such as "1,234" or "3.0"; strip the
# separators, keep the integer part and cast to int.
df['SAGO'] = df['SAGO'].apply(lambda x: int(float(str(x).replace(',', '').split('.')[0])))
# NOTE: a second `df['SAGO'].replace(',', '').astype(int)` pass used to follow.
# Series.replace matches whole cell values (not substrings) and the column is
# already integer at this point, so it had no effect and was removed.

# Rename the coded column names to their Korean descriptions.
df = df.rename(columns = {"ZINSRDAVL": "연령대", "ZIOSEXCD": "성별",
                        "ZDPRODSCD": "국산차량여부", "NCR": "직전3년간사고건수",
                        "ZCARPSGVL": "차량경과년수", "ZDRVLISCD___T": "운전자한정특별약관",
                        "ZENTCARCD": "가입경력코드", "ZCARISDAM": "차량가입금액",
                        "ZIMAGERVL": "영상기록장치특약가입", "YUHO": "유효대수",
                        "SAGO" : "사고건수"})

# Drop the columns that are not used downstream.
df = df.drop(columns = ['ZCPRLCLCD', 'ZDRVLISCD'])
df

Unnamed: 0,연령대,성별,국산차량여부,직전3년간사고건수,차량경과년수,차종,운전자한정특별약관,가입경력코드,차량가입금액,영상기록장치특약가입,마일리지약정거리,유효대수,사고건수
0,0.0,1.0,,B,신차,기타,가족및형제자매한정,8.0,5천만원이하,가입,15000K,1,0
1,0.0,1.0,,N,신차,기타,기명피보험자1인한정,8.0,미가입,미가입,15000K,1,0
2,0.0,1.0,,N,신차,기타,가족한정(형제자매제외),5.0,5천만원이하,가입,미가입,1,3
3,0.0,1.0,1.0,N,10년이상,중형,가족한정(형제자매제외),2.0,5천만원이하,미가입,15000K,0,0
4,0.0,1.0,1.0,Z,5년이하,다목적2종,가족한정(형제자매제외),1.0,미가입,미가입,미가입,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267772,40.0,2.0,1.0,C,10년이상,중형,누구나(기본),8.0,미가입,미가입,7000K,0,0
267773,40.0,2.0,1.0,C,5년이하,소형A,부부 및 자녀한정,8.0,5천만원이하,가입,15000K,2,0
267774,40.0,2.0,1.0,C,10년이하,소형A,기명피보험자1인한정,8.0,미가입,미가입,미가입,6,1
267775,40.0,2.0,1.0,C,5년이하,소형B,누구나(기본),7.0,5천만원이하,가입,미가입,0,0


In [14]:
# Build the binary target: 0 when the row has no accidents ('사고건수' == 0),
# 1 otherwise. The comparison is vectorised instead of a row-wise apply, which
# gives the same values without a Python-level loop over every row.
df['사고유무'] = (df['사고건수'] != 0).astype(int)

# Frame for the accident yes/no task: the raw count and exposure are removed.
df_yn = df.drop(columns = ['사고건수', '유효대수'])

In [15]:
# Preview the first five rows of the accident yes/no frame.
df_yn.head(5)

Unnamed: 0,연령대,성별,국산차량여부,직전3년간사고건수,차량경과년수,차종,운전자한정특별약관,가입경력코드,차량가입금액,영상기록장치특약가입,마일리지약정거리,사고유무
0,0.0,1.0,,B,신차,기타,가족및형제자매한정,8.0,5천만원이하,가입,15000K,0
1,0.0,1.0,,N,신차,기타,기명피보험자1인한정,8.0,미가입,미가입,15000K,0
2,0.0,1.0,,N,신차,기타,가족한정(형제자매제외),5.0,5천만원이하,가입,미가입,1
3,0.0,1.0,1.0,N,10년이상,중형,가족한정(형제자매제외),2.0,5천만원이하,미가입,15000K,0
4,0.0,1.0,1.0,Z,5년이하,다목적2종,가족한정(형제자매제외),1.0,미가입,미가입,미가입,0


## Model 함수 정의

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score
import math

#### 사고유무

In [None]:
# 로지스틱 회귀 함수
def logistic_classifier(X_train, X_test, y_train, y_test, random_state):
    """Fit a logistic regression and score it on the held-out split.

    Args:
        X_train: Training features passed to ``LogisticRegression.fit``.
        X_test: Held-out features used for prediction.
        y_train: Training labels.
        y_test: Held-out labels the predictions are scored against.
        random_state: Seed forwarded to ``LogisticRegression``.

    Returns:
        list: ``[accuracy, precision, recall, f1, auc, optimal_fpr]`` where
        ``optimal_fpr`` is the false-positive rate at the ROC point that
        maximises ``TPR - FPR``.
    """
    model = LogisticRegression(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Probability of the positive class (second column of predict_proba).
    # Kept 1-D: the ROC helpers below take a flat score vector, so the former
    # reshape to (n, 1) was unnecessary.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # NOTE(review): F1 is class-weighted while precision and recall use the
    # default averaging, so the three numbers are not directly comparable.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # FPR / TPR along the ROC curve; pick the point with the largest TPR - FPR.
    fper, tper, _ = roc_curve(y_test, y_proba)
    optimal_idx = np.argmax(tper - fper)
    optimal_fpr = fper[optimal_idx]

    return [accuracy, precision, recall, f1, auc, optimal_fpr]

## label encoder

### 컬럼별 인코더

In [None]:

def age_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '연령대' column.

    No encoding is implemented yet: every recognised strategy simply returns
    None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: One of "one-hot", "label_library", "label_handled"
            or "label_numeric".

    Raises:
        Exception: If ``labeling_param`` is not a recognised strategy.
    """
    known_strategies = ("one-hot", "label_library", "label_handled", "label_numeric")
    if labeling_param not in known_strategies:
        raise Exception("없는 라벨 파라미터")
    return None

def NCR_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '직전3년간사고건수' column.

    No encoding is implemented yet: every recognised strategy simply returns
    None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: One of "one-hot", "label_library", "label_handled"
            or "label_numeric".

    Raises:
        Exception: If ``labeling_param`` is not a recognised strategy.
    """
    known_strategies = ("one-hot", "label_library", "label_handled", "label_numeric")
    if labeling_param not in known_strategies:
        raise Exception("없는 라벨 파라미터")
    return None

def car_age_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '차량경과년수' column.

    No encoding is implemented yet: every recognised strategy simply returns
    None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: One of "one-hot", "label_library", "label_handled"
            or "label_numeric".

    Raises:
        Exception: If ``labeling_param`` is not a recognised strategy.
    """
    known_strategies = ("one-hot", "label_library", "label_handled", "label_numeric")
    if labeling_param not in known_strategies:
        raise Exception("없는 라벨 파라미터")
    return None

def member_experice_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '가입경력코드' column.

    No encoding is implemented yet: every recognised strategy simply returns
    None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: One of "one-hot", "label_library", "label_handled"
            or "label_numeric".

    Raises:
        Exception: If ``labeling_param`` is not a recognised strategy.
    """
    known_strategies = ("one-hot", "label_library", "label_handled", "label_numeric")
    if labeling_param not in known_strategies:
        raise Exception("없는 라벨 파라미터")
    return None

def car_amount_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '차량가입금액' column.

    No encoding is implemented yet: every recognised strategy simply returns
    None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: One of "one-hot", "label_library", "label_handled"
            or "label_numeric".

    Raises:
        Exception: If ``labeling_param`` is not a recognised strategy.
    """
    known_strategies = ("one-hot", "label_library", "label_handled", "label_numeric")
    if labeling_param not in known_strategies:
        raise Exception("없는 라벨 파라미터")
    return None
def sex_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '성별' column.

    Only the "one-hot" strategy is recognised, and it is not implemented yet:
    the function returns None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: Must be "one-hot".

    Raises:
        Exception: If ``labeling_param`` is anything other than "one-hot".
    """
    if labeling_param != "one-hot":
        raise Exception("없는 라벨 파라미터")
    return None
def domestic_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '국산차량여부' column.

    Only the "one-hot" strategy is recognised, and it is not implemented yet:
    the function returns None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: Must be "one-hot".

    Raises:
        Exception: If ``labeling_param`` is anything other than "one-hot".
    """
    if labeling_param != "one-hot":
        raise Exception("없는 라벨 파라미터")
    return None

def terms_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '운전자한정특별약관' column.

    Only the "one-hot" strategy is recognised, and it is not implemented yet:
    the function returns None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: Must be "one-hot".

    Raises:
        Exception: If ``labeling_param`` is anything other than "one-hot".
    """
    if labeling_param != "one-hot":
        raise Exception("없는 라벨 파라미터")
    return None

def blackbox_encoder(raw_df, labeling_param):
    """Placeholder encoder for the '영상기록장치특약가입' column.

    Only the "one-hot" strategy is recognised, and it is not implemented yet:
    the function returns None.

    Args:
        raw_df: Frame to encode (currently unused).
        labeling_param: Must be "one-hot".

    Raises:
        Exception: If ``labeling_param`` is anything other than "one-hot".
    """
    if labeling_param != "one-hot":
        raise Exception("없는 라벨 파라미터")
    return None

## 전처리 Control

### Case 나누기

In [None]:
### CASE 별 함수 정의 ###
import pandas as pd


# 1. 0~10대만 지우기
def remove_age(df, remove_age_list, flag):
    """Optionally drop the rows whose '연령대' value is in ``remove_age_list``.

    Args:
        df: Input frame containing a '연령대' column.
        remove_age_list: Values of '연령대' to exclude.
        flag: When equal to True the filter is applied; otherwise ``df`` is
            returned untouched (the same object).

    Returns:
        The filtered frame, or ``df`` itself when the filter is disabled.
    """
    if flag != True:
        return df

    # Keep every row whose age band is NOT in the removal list.
    keep_mask = ~df['연령대'].isin(remove_age_list)
    return df[keep_mask]


# 2. 연령, 성별 빼고 중복된 정보를 제거
def remove_columns_and_duplicates(df, remove_column_list, flag):
    """Optionally drop rows that are duplicates on all but the listed columns.

    No column is actually removed. The columns in ``remove_column_list`` are
    only left out of the key used to detect duplicates. Rows are sorted by
    '가입경력코드' in descending order first, so the first row of each
    duplicate group in that order is the one kept.

    Args:
        df: Input frame containing a '가입경력코드' column.
        remove_column_list: Columns to ignore when comparing rows.
        flag: When equal to True the de-duplication runs; otherwise ``df`` is
            returned untouched (the same object).

    Returns:
        The de-duplicated frame (in sorted order), or ``df`` itself.
    """
    if flag != True:
        return df

    # Columns that make up the duplicate-detection key.
    key_columns = [name for name in df.columns.tolist() if name not in remove_column_list]
    print(key_columns)

    ordered = df.sort_values(by='가입경력코드', ascending=False)
    return ordered.drop_duplicates(subset=key_columns, keep='first')


# 3. NCR ('직전3년간사고건수'): replace the '결측치' (missing) marker with 0, or drop those rows
def replace_NCR_nan(df, flag):
    """Handle the '결측치' marker in the '직전3년간사고건수' column.

    Args:
        df: Input frame containing a '직전3년간사고건수' column.
        flag: When equal to 0 the marker is replaced by 0 in a copy of the
            frame; for any other value the rows carrying the marker are
            dropped.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    target = '직전3년간사고건수'

    if flag != 0:
        # Drop mode: keep only the rows without the marker.
        return df[df[target] != '결측치']

    # Replace mode: work on a copy so the caller's frame stays intact.
    result = df.copy()
    result[target] = df[target].replace('결측치', 0)
    return result


# 4. 국산코드 결측치 -> 날리기만 실행
def drop_nan_prod(df):
    """Return a new frame without the rows whose '국산차량여부' is missing.

    Args:
        df: Input frame containing a '국산차량여부' column.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    required_columns = ['국산차량여부']
    return df.dropna(subset=required_columns)


# 5. Drop the rows whose 사고건수 (SAGO) exceeds the given quantile.
def remove_SAGO_outliers(df, q):
    """Keep the rows whose '사고건수' is at or below its ``q`` quantile.

    Args:
        df: Input frame containing a '사고건수' column.
        q: Quantile level forwarded to ``Series.quantile``.

    Returns:
        The filtered frame.
    """
    accident_counts = df['사고건수']
    cutoff = accident_counts.quantile(q)
    return df[accident_counts <= cutoff]


# 6. 임직원 한정 제거
def remove_executives(df, flag):
    """Optionally drop the rows whose '운전자한정특별약관' is '임직원한정'.

    Args:
        df: Input frame containing a '운전자한정특별약관' column.
        flag: When equal to True the filter is applied; otherwise ``df`` is
            returned untouched (the same object).

    Returns:
        The filtered frame, or ``df`` itself when the filter is disabled.
    """
    if flag != True:
        return df

    not_executive = df['운전자한정특별약관'] != '임직원한정'
    return df[not_executive]


# 7. Accident rate: when 유효대수 is 0, smooth it with 0.5 (added to every row, or only to the zero rows), or drop those rows.
def make_sago_rate(df, value):
    """Return a copy of ``df`` with an accident-rate column '사고율' added.

    '사고율' is '사고건수' divided by '유효대수'. ``value`` selects how rows
    with a zero '유효대수' are handled before the division:

    * ``0``: add 0.5 to '유효대수' on every row.
    * ``1``: replace only the zero '유효대수' values with 0.5.
    * anything else: drop the rows whose '유효대수' is 0.

    Args:
        df: Input frame with '사고건수' and '유효대수' columns.
        value: Handling mode described above.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    if value == 0:
        new_df = df.copy()
        new_df['유효대수'] += 0.5

    elif value == 1:
        new_df = df.copy()
        zero_exposure = new_df['유효대수'] == 0
        if zero_exposure.any():
            # Cast to float before writing 0.5: assigning a fractional value
            # into an integer column through .loc relies on implicit
            # upcasting, which newer pandas deprecates and then rejects.
            new_df['유효대수'] = new_df['유효대수'].astype(float)
            new_df.loc[zero_exposure, '유효대수'] = 0.5

    else:
        new_df = df[df['유효대수'] != 0].copy()

    new_df['사고율'] = new_df['사고건수'] / new_df['유효대수']
    return new_df


In [None]:
# '연령대' values that remove_age drops when its flag is enabled.
remove_age_list = [0, 10]
# Columns left out of the duplicate-detection key in remove_columns_and_duplicates.
remove_column_list = ['연령대', '성별', '가입경력코드']
# Quantile levels considered for the 사고건수 outlier cut-off.
quantile_list = [0.70,0.71, 0.86, 0.91,0.95, 0.97, 0.99, 1]

## 코드실행 부분

### 모델 가져오는 부분.

In [None]:
def get_df_columns(model_name):
    """Return the metric column names reported for ``model_name``.

    Args:
        model_name: Model identifier; only "logi" is supported.

    Returns:
        list: Metric names, in the order the model function returns them.

    Raises:
        Exception: If ``model_name`` is not a supported identifier.
    """
    if model_name != "logi":
        raise Exception("없는 모델 이름")
    return ["accuracy", "precision", "recall", "f1", "auc", "optimal_fpr"]

def model_result(other_params, X_train, X_test, y_train, y_test):
    """Train and score the model selected by ``other_params["model_name"]``.

    Args:
        other_params: Settings dict; reads "model_name" and "random_state".
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        list: The metric values returned by the selected model function.

    Raises:
        Exception: If the model name is not supported.
    """
    # Fixed: the original read the misspelled name `oher_params` and an
    # undefined `random_state`, so every call failed with NameError.
    if other_params["model_name"] == "logi":
        return logistic_classifier(
            X_train, X_test, y_train, y_test, other_params["random_state"]
        )
    raise Exception("없는 모델 이름")

### 전처리, label 인코딩 처리 가져오는 부분

In [None]:
# Re-declared here with the same values as the earlier cell, so this cell can
# run on its own.
# '연령대' values that remove_age drops when its flag is enabled.
remove_age_list = [0, 10]
# Columns left out of the duplicate-detection key in remove_columns_and_duplicates.
remove_column_list = ['연령대', '성별', '가입경력코드']


def make_preprocessed_dataframe(raw_df, preprocessing_param):
    """Apply the preprocessing steps selected by ``preprocessing_param``.

    Steps, in order: age-band removal, de-duplication, handling of the
    '직전3년간사고건수' missing marker, removal of rows with a missing
    '국산차량여부' (always applied), 사고건수 outlier removal, and removal
    of the '임직원한정' rows.

    Args:
        raw_df: Source frame; it is not modified.
        preprocessing_param: Dict with the keys "연령대제거",
            "연령성별가입기간 제외 후 중복제거", "직전3년간사고건수",
            "유효대수이상치제거" (a quantile level) and "임직원한정제거".

    Returns:
        The preprocessed frame.
    """
    dfa = remove_age(raw_df, remove_age_list, preprocessing_param["연령대제거"])
    dfb = remove_columns_and_duplicates(dfa, remove_column_list, preprocessing_param["연령성별가입기간 제외 후 중복제거"])
    dfc = replace_NCR_nan(dfb, preprocessing_param["직전3년간사고건수"])
    dfd = drop_nan_prod(dfc)
    # Fixed: the parameter already holds the quantile level. It used to be
    # used as an index into quantile_list, which raised TypeError for float
    # levels and silently picked 0.71 for the level 1.
    dfe = remove_SAGO_outliers(dfd, preprocessing_param["유효대수이상치제거"])
    dff = remove_executives(dfe, preprocessing_param["임직원한정제거"])

    return dff


def make_labeled_dataframe(raw_df, labeling_param):
    """Return a copy of ``raw_df`` with each feature column re-encoded.

    Every listed column is overwritten with the result of its encoder, which
    is called with ``raw_df`` and the strategy stored under the column's name
    in ``labeling_param``.

    NOTE(review): the encoders visible in this file are stubs that return
    None, so the columns end up filled with None until they are implemented.

    Args:
        raw_df: Feature frame; it is not modified.
        labeling_param: Dict mapping each column name to its encoding strategy.

    Returns:
        The encoded copy.
    """
    # (column, encoder) pairs in the order they are applied.
    column_encoders = (
        ("연령대", age_encoder),
        ("직전3년간사고건수", NCR_encoder),
        ("차량경과년수", car_age_encoder),
        ("가입경력코드", member_experice_encoder),
        ("차량가입금액", car_amount_encoder),
        ("성별", sex_encoder),
        ("국산차량여부", domestic_encoder),
        ("운전자한정특별약관", terms_encoder),
        ("영상기록장치특약가입", blackbox_encoder),
    )

    new_df = raw_df.copy()
    for column, encode in column_encoders:
        new_df[column] = encode(raw_df, labeling_param[column])

    return new_df


### 코드 실행

In [None]:
import numpy as np
from tqdm import tqdm
import time
from sklearn.metrics import mean_squared_error
from itertools import product
from sklearn.model_selection import train_test_split

# 빈 데이터프레임 생성



def test_all_cases(raw_df, preprocessing_params, labeling_params, other_params):
    """Evaluate the model on every preprocessing x labeling combination.

    Each successful run appends one row (the combination's label followed by
    the model metrics) to a result table, which is rewritten to
    ``other_params["result_dir"]`` after every run. Failures are printed and
    skipped so the sweep keeps going.

    Args:
        raw_df: Source frame with '사고건수', '유효대수' and '사고유무'.
        preprocessing_params: Dict mapping option name to candidate values.
        labeling_params: Dict mapping column name to candidate strategies.
        other_params: Dict with "model_name", "random_state" and "result_dir".
    """
    columns = get_df_columns(other_params["model_name"])
    result_df = pd.DataFrame(columns=["label_name"] + columns)
    result_dir = other_params["result_dir"]

    # Iterate over every combination of preprocessing options.
    for preprocessing_values in tqdm(product(*preprocessing_params.values()), leave=True):
        preprocessing_param = dict(zip(preprocessing_params.keys(), preprocessing_values))

        # Preprocessing depends only on the preprocessing options, so it is
        # done once here instead of once per labeling combination.
        try:
            preprocessed_df = make_preprocessed_dataframe(raw_df, preprocessing_param)

            # Drop the accident count and exposure, then split features/target.
            df_yn = preprocessed_df.drop(columns = ['사고건수', '유효대수'])
            yn_X = df_yn.drop(columns = ["사고유무"])
            yn_y = df_yn["사고유무"]

        except Exception as ex:  # best-effort sweep: report and move on
            print("###########에러###########")
            print("에러내용: ", ex)
            print("에러라벨: ", str(preprocessing_param))
            continue

        # Iterate over every combination of labeling strategies.
        for labeling_values in tqdm(product(*labeling_params.values()), leave=True):
            labeling_param = dict(zip(labeling_params.keys(), labeling_values))

            # The label is the merged parameter combination.
            label_name = str({**preprocessing_param, **labeling_param})

            try:
                labeled_X_df = make_labeled_dataframe(yn_X, labeling_param)

                X_train, X_test, y_train, y_test = train_test_split(labeled_X_df, yn_y, test_size = 0.2, random_state = other_params["random_state"])

                new_row = [label_name] + model_result(other_params, X_train, X_test, y_train, y_test)

                # Checkpoint after every run so partial results survive a crash.
                result_df.loc[len(result_df)] = new_row
                result_df.to_csv(result_dir, index=False)

            except Exception as ex:  # best-effort sweep: report and move on
                print("###########에러###########")
                print("에러내용: ", ex)
                print("에러라벨: ", label_name)


In [None]:
# Candidate values for each preprocessing option; test_all_cases evaluates
# every combination of them.
preprocessing_params = {
    "연령대제거" : [True, False],                                          # True: remove, False: keep
    "연령성별가입기간 제외 후 중복제거" : [True, False],                     # True: remove, False: keep
    "직전3년간사고건수" : [-1, 0],                                          # (-1: drop the rows, 0: replace with 0)
    "국산차량여부결측치제거" : [True],                                        # True only
    "유효대수이상치제거" : [0.70,0.71, 0.86, 0.91,0.95, 0.97, 0.99, 1],     # remove the top share of rows
    "임직원한정제거": [True, False]                                         # True: remove, False: keep
}

# Candidate encoding strategies for each feature column.
labeling_params = {
    "연령대" : ["one-hot", "label_library", "label_handled", "label_numeric"], # one-hot, library label encoder, Eun-a, Min-ho (presumably the authors of the two hand-made schemes)
    "직전3년간사고건수" : ["one-hot", "label_library", "label_handled", "label_numeric"],
    "차량경과년수" : ["one-hot", "label_library", "label_handled", "label_numeric"],
    "가입경력코드" : ["one-hot", "label_library", "label_handled", "label_numeric"],
    "차량가입금액" : ["one-hot", "label_library", "label_handled", "label_numeric"],

    "성별" : ["one-hot"],
    "국산차량여부" : ["one-hot"],
    "운전자한정특별약관" : ["one-hot"],
    "영상기록장치특약가입" : ["one-hot"],

    # "유효대수" : [],
    # "사고건수" : []
}

# Settings shared by every run.
other_params = {
    "random_state" : 39, # one random_state value used everywhere
    "model_name" : "logi", # "logi" (the original note also lists "rf", which get_df_columns does not handle)
    "result_dir" : '/content/drive/MyDrive/Dataset/things.csv'
}



# Run the full sweep on the merged frame.
test_all_cases(df, preprocessing_params, labeling_params, other_params)

## class 로 구현 (추후 예정... 이었는데 굳이 할 필요없을듯)

In [None]:
class classifier:
    """Object-oriented wrapper around the preprocessing / training steps.

    Typical order of calls: ``processing`` -> ``get_X_y`` ->
    ``get_train_test`` -> ``train``.
    """

    def __init__(self, df):
        """Store the source frame and initialise the split placeholders."""
        self.df = df
        # '연령대' values dropped by remove_age.
        self.remove_age_list = [0, 10]
        # Columns ignored when detecting duplicate rows.
        self.remove_column_list = ['연령대', '성별', '가입경력코드']
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def processing(self, preprocessing_params):
        """Apply the preprocessing steps to ``self.df``.

        Args:
            preprocessing_params: Dict with the keys "연령대제거",
                "연령성별가입기간 제외 후 중복제거", "직전3년간사고건수",
                "유효대수이상치제거" (a quantile level) and "임직원한정제거".
        """
        self.df = drop_nan_prod(self.df)
        self.df = remove_age(self.df, self.remove_age_list, preprocessing_params["연령대제거"])
        self.df = remove_columns_and_duplicates(self.df, self.remove_column_list, preprocessing_params["연령성별가입기간 제외 후 중복제거"])
        # Fixed: this step used to receive the de-duplication flag instead of
        # its own "직전3년간사고건수" setting.
        self.df = replace_NCR_nan(self.df, preprocessing_params["직전3년간사고건수"])
        self.df = remove_SAGO_outliers(self.df, preprocessing_params["유효대수이상치제거"])
        self.df = remove_executives(self.df, preprocessing_params["임직원한정제거"])

    def get_X_y(self):
        """Split ``self.df`` into features ``self.X`` and target ``self.y``."""
        df_yn = self.df.drop(columns = ['사고건수', '유효대수'])
        self.X = df_yn.drop(columns = ["사고유무"])
        self.y = df_yn["사고유무"]

    def get_train_test(self, random_state):
        """Create an 80/20 train/test split of ``self.X`` / ``self.y``.

        Args:
            random_state: Seed forwarded to ``train_test_split``.
        """
        from sklearn.model_selection import train_test_split

        # Fixed: y_test used to be bound to a local name and was lost.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size = 0.2, random_state = random_state
        )

    def train(self, model_name):
        """Fit ``model_name`` on the training split and report test metrics.

        Args:
            model_name: Despite its name, an estimator instance exposing
                ``fit``, ``predict`` and ``predict_proba``.

        Returns:
            tuple: ``(accuracy, report, f1, auc)``.
        """
        model = model_name

        model.fit(self.X_train, self.y_train)

        # Fixed: predictions and metrics now use the instance's own test
        # split instead of the undefined globals X_test / y_test.
        y_pred = model.predict(self.X_test)
        # Probability of the positive class, kept 1-D for the ROC helpers.
        y_proba = model.predict_proba(self.X_test)[:, 1]

        accuracy = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred, average='weighted')
        auc = roc_auc_score(self.y_test, y_proba)

        print(f"정확도 : {accuracy} \n")
        print("분류 보고서 :\n", report, "\n")
        print(f"F1 점수 : {f1} \n")
        print(f"auc : {auc} \n")

        # FPR / TPR / thresholds along the ROC curve; pick the point with the
        # largest TPR - FPR.
        fper, tper, thresholds = roc_curve(self.y_test, y_proba)
        optimal_idx = np.argmax(tper - fper)
        optimal_threshold = thresholds[optimal_idx]
        optimal_fpr = fper[optimal_idx]

        print('optimal fpr:', optimal_fpr, ', threshold:', optimal_threshold, '\n\n\n')

        return accuracy, report, f1, auc

    def output(self):
        """Placeholder; not implemented yet."""
        pass