### 라이브러리 호출

In [1]:
# 연산 처리  패키지
import pandas as pd
import numpy as np

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 통계분석 패키지
import statsmodels.api as sm

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import math
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [4]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### 모델 함수

In [3]:
# 로지스틱 회귀 함수
def logistic_classifier(X, y):

    # 평가지표 / best param 저장 list
    acc = []
    prec = []
    recall = []
    auc = []

    acc_opt = []
    prec_opt = []
    recall_opt = []

    opt_thres = []

    # LogisticRegression 초기화
    model = LogisticRegression()

    for i in range(100):

        # y 범주 비율에 맞춰 train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

        # 데이터 Random Sampling
        X_train_rd = X_train.sample(n = 10000)
        y_train_rd = y_train[X_train_rd.index]

        # SMOTE로 X oversampling
        smt = SMOTE(sampling_strategy = 'auto')
        X_train_sm, y_train_sm = smt.fit_resample(X_train_rd, y_train_rd)

        # 모델 학습 / 예측
        model.fit(X_train_sm, y_train_sm)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1] # 양성 클래스에 대한 확률 추출

        # thr = 0.5일 때 평가지표 저장
        acc.append((accuracy_score(y_test, y_pred)))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_proba))

        # Optimal thr 저장
        fper, tper, thresholds = roc_curve(y_test, y_proba)
        optimal_idx = np.argmax(tper - fper)   # fpr, tpr 간 차이가 가장 클 때의 index 저장

        # Optimal thr일 때 평가지표 저장
        y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

        acc_opt.append(accuracy_score(y_test, y_optpred))
        prec_opt.append(precision_score(y_test, y_optpred))
        recall_opt.append(recall_score(y_test, y_optpred))
        opt_thres.append(thresholds[optimal_idx])

        # 횟수 출력
        if((i+1)%20 == 0) :
            print(f"{i+1}번째 완료")

    # 결과 DataFrame 생성
    results = pd.DataFrame({
        '0.5 acc' : acc,
        '0.5 prec': prec,
        '0.5 recall': recall,
        'opt acc' : acc_opt,
        'opt prec' : prec_opt,
        'opt recall' : recall_opt,
        'auc': auc,
        'opt threshold' : opt_thres
    })

    return results

In [5]:
# base preprocess 데이터로부터 y값 읽어오기
base = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", encoding = "euc-kr", engine='python')
yn_y = base['사고유무']

In [6]:
yn_X = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/DF/1111211.csv", encoding = 'euc-kr', engine='python')
predicted = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/predicted_result.csv", encoding = 'UTF8', engine='python')

In [9]:
yn_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210315 entries, 0 to 210314
Data columns (total 36 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   여성            210315 non-null  bool   
 1   외산            210315 non-null  bool   
 2   1인 및 지정1인     210315 non-null  bool   
 3   가족 및 지정1인     210315 non-null  bool   
 4   가족및형제자매한정     210315 non-null  bool   
 5   가족한정(형제자매제외)  210315 non-null  bool   
 6   기명피보험자1인한정    210315 non-null  bool   
 7   누구나(기본)       210315 non-null  bool   
 8   부부 및 지정1인     210315 non-null  bool   
 9   부부한정          210315 non-null  bool   
 10  임직원한정         210315 non-null  bool   
 11  미가입           210315 non-null  bool   
 12  연령대           210315 non-null  float64
 13  C             210315 non-null  bool   
 14  D             210315 non-null  bool   
 15  N             210315 non-null  bool   
 16  Z             210315 non-null  bool   
 17  10년이하         210315 non-null  bool   
 18  5년이하

### 파생변수 추가

In [8]:
yn_X['연령대'].value_counts()

연령대
35.0    53562
44.0    46597
54.0    37075
64.0    28452
25.0    26302
73.0    13366
83.0     4327
93.0      524
19.0      110
Name: count, dtype: int64

In [11]:
# 연령대별 사고율
yn_X['연령대사고율'] = np.where(yn_X['연령대'] == 35, predicted['30대'],
                        np.where(yn_X['연령대'] == 44, predicted['40대'],
                        np.where(yn_X['연령대'] == 54, predicted['50대'],
                        np.where(yn_X['연령대'] == 64, predicted['~64'],
                        np.where(yn_X['연령대'] == 25, predicted['20대'],
                        np.where(yn_X['연령대'] == 73, predicted['65~'],
                        np.where(yn_X['연령대'] == 83, predicted['65~'],
                        np.where(yn_X['연령대'] == 93, predicted['65~'], predicted['20대'])))))))) # 10대가 지워짐


# 성별별 사고율
yn_X['성별사고율'] = np.where(yn_X['여성'] == False, predicted['남성'], predicted['여성'])

# 특약별 사고율
yn_X['특약사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), predicted['특약 기명피보험자1인'],
                        np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), predicted['특약부부운전'],
                        np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), predicted['특약 가족'],
                        np.where(yn_X['누구나(기본)'] == True, predicted['특약 미가입'],
                        np.where(yn_X['임직원한정'] == True, predicted['특약 기타'], predicted['특약부부운전']))))) # 부부및자녀한정이지워짐
yn_X.head(3)

Unnamed: 0,여성,외산,1인 및 지정1인,가족 및 지정1인,가족및형제자매한정,가족한정(형제자매제외),기명피보험자1인한정,누구나(기본),부부 및 지정1인,부부한정,...,5.0,6.0,7.0,8.0,1억이하,5천만원이하,미가입.1,연령대사고율,성별사고율,특약사고율
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,True,False,0.000596,0.022789,0.062796
1,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,True,False,0.000596,0.022789,0.056164
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,0.000596,0.022789,0.062796


In [12]:
X1 = yn_X.drop(columns = ['연령대사고율', '성별사고율', '특약사고율'], axis = 1)
X2 = yn_X.drop(columns = ['성별사고율', '특약사고율'], axis = 1)
X3 = yn_X.drop(columns = ['연령대사고율', '특약사고율'], axis = 1)
X4 = yn_X.drop(columns = ['연령대사고율', '성별사고율'], axis = 1)
X5 = yn_X.drop(columns = ['특약사고율'], axis = 1)
X6 = yn_X.drop(columns = ['성별사고율'], axis = 1)
X7 = yn_X.drop(columns = ['연령대사고율'], axis = 1)
X8 = yn_X

### 모델 실행

In [13]:
# Logistic Classifier model fit

total = pd.DataFrame()

# X 지정
yn_X = X1

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '000'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료


In [14]:
# X 지정
yn_X = X2

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '100'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X3

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '010'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X4

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '001'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

###############
# X 지정
yn_X = X5

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '110'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X6

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '101'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

##################
# X 지정
yn_X = X7

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '011'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")

###############################
# X 지정
yn_X = X8

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = '111'
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", index = False, encoding="euc-kr")


20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료
20번째 완료
40번째 완료
60번째 완료
80번째 완료
100번째 완료


In [15]:
rst = pd.read_csv("/content/drive/MyDrive/기계학습의이해/LR_3rd_result.csv", encoding = "euc-kr", engine='python')
rst

Unnamed: 0,0.5 acc,0.5 prec,0.5 recall,opt acc,opt prec,opt recall,auc,opt threshold,파생변수
0,0.602287,0.221794,0.546859,0.598555,0.221303,0.553523,0.608512,0.497843,0
1,0.593514,0.219084,0.555367,0.612652,0.223420,0.529137,0.606338,0.509606,0
2,0.589520,0.214896,0.545725,0.565152,0.211669,0.584858,0.600062,0.487507,0
3,0.583316,0.215998,0.564724,0.609063,0.221120,0.527860,0.607479,0.512660,0
4,0.600100,0.220467,0.546151,0.584148,0.218276,0.573373,0.608762,0.490451,0
...,...,...,...,...,...,...,...,...,...
795,0.592564,0.213771,0.533957,0.566507,0.211428,0.580746,0.598055,0.485353,111
796,0.592516,0.216673,0.546859,0.572094,0.214442,0.582731,0.603995,0.488981,111
797,0.595559,0.220048,0.554941,0.558068,0.214455,0.614207,0.610304,0.479955,111
798,0.590186,0.219003,0.562739,0.547251,0.213914,0.635616,0.611711,0.474722,111


In [21]:
# charting : 1

# 결과값 top 50 비교
print("======= 50개 =======")
# opt acc 기준으로 내림차순 정렬 후 상위 10개 row 선택
results_sorted = rst.sort_values(by='opt acc', ascending=False).head(50)
results_sorted['파생변수'].value_counts()



파생변수
0      15
101    13
1       5
111     4
110     4
10      3
100     3
11      3
Name: count, dtype: int64

In [26]:
# 결과값 top 100 비교
print("======= 100개 =======")
# opt acc 기준으로 내림차순 정렬 후 상위 100개 row 선택
results_sorted = rst.sort_values(by='opt acc', ascending=False).head(100)
results_sorted['파생변수'].value_counts()



파생변수
0      23
101    19
1      15
110    10
11      9
111     8
10      8
100     8
Name: count, dtype: int64

In [24]:
# charting : 2
# 결과값 평균 비교
rst_accmean = rst.groupby('파생변수')['opt acc'].mean()
rst_accmean.sort_values(ascending = False)

파생변수
1      0.568691
101    0.568236
11     0.567963
0      0.566861
10     0.565955
111    0.561004
110    0.560116
100    0.559897
Name: opt acc, dtype: float64

In [25]:
# 결과값 중앙값 비교
rst_accmedian = rst.groupby('파생변수')['opt acc'].median()
rst_accmedian.sort_values(ascending = False)

파생변수
11     0.570085
10     0.569063
1      0.568183
101    0.565972
111    0.562478
0      0.560814
110    0.560659
100    0.560540
Name: opt acc, dtype: float64

### 결론

3111211 데이터는 파생변수 추가 X?...

In [None]:
# 파생변수 전처리 저장
# .to_csv("/content/drive/MyDrive/기계학습의이해/LR1111211_.csv", index = False, encoding="euc-kr")