### 라이브러리 호출

In [10]:
# 연산 처리  패키지
import pandas as pd
import numpy as np

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 통계분석 패키지
import statsmodels.api as sm

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import math
# from sklearn.model_selection import cross_val_score, cross_validate
# from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [12]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### 모델 함수

In [13]:
# 로지스틱 회귀 함수
def logistic_classifier(X, y):

    # 평가지표 / best param 저장 list
    acc = []
    prec = []
    recall = []
    auc = []

    acc_opt = []
    prec_opt = []
    recall_opt = []

    opt_thres = []

    # LogisticRegression 초기화
    model = LogisticRegression()

    for i in range(100):

        # y 범주 비율에 맞춰 train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

        # 데이터 Random Sampling
        X_train_rd = X_train.sample(n = 10000)
        y_train_rd = y_train[X_train_rd.index]

        # SMOTE로 X oversampling
        smt = SMOTE(sampling_strategy = 'auto')
        X_train_sm, y_train_sm = smt.fit_resample(X_train_rd, y_train_rd)

        # 모델 학습 / 예측
        model.fit(X_train_sm, y_train_sm)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1] # 양성 클래스에 대한 확률 추출

        # thr = 0.5일 때 평가지표 저장
        acc.append((accuracy_score(y_test, y_pred)))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_proba))

        # Optimal thr 저장
        fper, tper, thresholds = roc_curve(y_test, y_proba)
        optimal_idx = np.argmax(tper - fper)   # fpr, tpr 간 차이가 가장 클 때의 index 저장

        # Optimal thr일 때 평가지표 저장
        y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

        acc_opt.append(accuracy_score(y_test, y_optpred))
        prec_opt.append(precision_score(y_test, y_optpred))
        recall_opt.append(recall_score(y_test, y_optpred))
        opt_thres.append(thresholds[optimal_idx])

        # 횟수 출력
        if((i+1)%10 == 0) :
            print(f"{i+1}번째 완료")

    # 결과 DataFrame 생성
    results = pd.DataFrame({
        '0.5 acc' : acc,
        '0.5 prec': prec,
        '0.5 recall': recall,
        'opt acc' : acc_opt,
        'opt prec' : prec_opt,
        'opt recall' : recall_opt,
        'auc': auc,
        'opt threshold' : opt_thres
    })

    return results

### Logistic Regression 결과

In [14]:
# base preprocess 데이터로부터 y값 읽어오기
base = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", encoding = "euc-kr", engine='python')
yn_y = base['사고유무']

In [15]:
yn_X = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/DF/2111211.csv", encoding = 'euc-kr', engine='python')
predicted = pd.read_csv(f"/content/drive/MyDrive/기계학습의이해/Dataset/predicted_result.csv", encoding = 'UTF8', engine='python')

### 파생변수 추가

In [16]:
# 연령대별 사고율
yn_X['연령대사고율'] = np.where(yn_X['연령대'] == 30, predicted['30대'],
                          np.where(yn_X['연령대'] == 40, predicted['40대'],
                          np.where(yn_X['연령대'] == 50, predicted['50대'],
                          np.where(yn_X['연령대'] == 60, predicted['~64'],
                          np.where(yn_X['연령대'] <= 20, predicted['20대'],
                          np.where(yn_X['연령대'] >= 70, predicted['65~'], np.nan))))))

# 성별별 사고율
yn_X['성별사고율'] = np.where(yn_X['여성'] == False, predicted['남성'], predicted['여성'])

# 특약별 사고율
yn_X['특약사고율'] = np.where((yn_X['기명피보험자1인한정'] == True) | (yn_X['1인 및 지정1인'] == True), predicted['특약 기명피보험자1인'],
                        np.where((yn_X['부부한정'] == True) | (yn_X['부부 및 지정1인'] == True), predicted['특약부부운전'],
                        np.where((yn_X['가족한정(형제자매제외)'] == True) | (yn_X['가족및형제자매한정'] == True) | (yn_X['가족 및 지정1인'] == True), predicted['특약 가족'],
                        np.where(yn_X['누구나(기본)'] == True, predicted['특약 미가입'],
                        np.where(yn_X['임직원한정'] == True, predicted['특약 기타'], predicted['특약부부운전']))))) # 부부및자녀한정이지워짐
yn_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210315 entries, 0 to 210314
Data columns (total 39 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   여성            210315 non-null  bool   
 1   외산            210315 non-null  bool   
 2   1인 및 지정1인     210315 non-null  bool   
 3   가족 및 지정1인     210315 non-null  bool   
 4   가족및형제자매한정     210315 non-null  bool   
 5   가족한정(형제자매제외)  210315 non-null  bool   
 6   기명피보험자1인한정    210315 non-null  bool   
 7   누구나(기본)       210315 non-null  bool   
 8   부부 및 지정1인     210315 non-null  bool   
 9   부부한정          210315 non-null  bool   
 10  임직원한정         210315 non-null  bool   
 11  미가입           210315 non-null  bool   
 12  연령대           210315 non-null  int64  
 13  C             210315 non-null  bool   
 14  D             210315 non-null  bool   
 15  N             210315 non-null  bool   
 16  Z             210315 non-null  bool   
 17  10년이하         210315 non-null  bool   
 18  5년이하

In [17]:
X1 = yn_X.drop(columns = ['연령대사고율', '성별사고율', '특약사고율'], axis = 1)
X2 = yn_X.drop(columns = ['성별사고율', '특약사고율'], axis = 1)
X3 = yn_X.drop(columns = ['연령대사고율', '특약사고율'], axis = 1)
X4 = yn_X.drop(columns = ['연령대사고율', '성별사고율'], axis = 1)
X5 = yn_X.drop(columns = ['특약사고율'], axis = 1)
X6 = yn_X.drop(columns = ['성별사고율'], axis = 1)
X7 = yn_X.drop(columns = ['연령대사고율'], axis = 1)
X8 = yn_X

### 모델 실행

In [36]:
# Logistic Classifier model fit

total = pd.DataFrame()

# X 지정
yn_X = X1

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "000"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료


In [38]:
# X 지정
yn_X = X2

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "100"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X3

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "010"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X4

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "001"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

###############
# X 지정
yn_X = X5

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "110"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

################
# X 지정
yn_X = X6

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "101"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

##################
# X 지정
yn_X = X7

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "011"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")

###############################
# X 지정
yn_X = X8

# smote 계산을 위해 data type float로 변경
for col_name in yn_X.columns:
    yn_X[col_name] = yn_X[col_name].astype(float)

# 모델 실행
logistic_rst = logistic_classifier(yn_X, yn_y)

# csv 전체 합쳐서 만드는 코드
logistic_rst['파생변수'] = "111"
total = pd.concat([total, logistic_rst])

total.to_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", index = False, encoding="euc-kr")


10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료
10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료


In [4]:
rst = pd.read_csv("/content/drive/MyDrive/기계학습의이해/LR_1st_result.csv", encoding = "euc-kr", engine='python')
rst

Unnamed: 0,0.5 acc,0.5 prec,0.5 recall,opt acc,opt prec,opt recall,auc,opt threshold,파생변수
0,0.589544,0.218616,0.562456,0.551316,0.214244,0.628243,0.608577,0.478157,0
1,0.597841,0.219594,0.547568,0.583363,0.218283,0.575216,0.606596,0.491913,0
2,0.591351,0.215537,0.544449,0.617360,0.220857,0.507160,0.599184,0.513422,0
3,0.590257,0.213151,0.536368,0.504006,0.203826,0.673756,0.599821,0.453575,0
4,0.603262,0.220384,0.538352,0.599410,0.219942,0.545442,0.608080,0.497694,0
...,...,...,...,...,...,...,...,...,...
795,0.589235,0.215229,0.547852,0.546252,0.211203,0.623848,0.601489,0.475547,111
796,0.594228,0.213546,0.529278,0.491620,0.202194,0.689777,0.600578,0.448686,111
797,0.580795,0.213652,0.559620,0.543732,0.209774,0.622005,0.601535,0.480058,111
798,0.589782,0.219170,0.564441,0.566103,0.216305,0.605274,0.608838,0.486708,111


In [5]:
# charting : 1

# 결과값 top 50 비교
print("======= 50개 =======")
# opt acc 기준으로 내림차순 정렬 후 상위 10개 row 선택
results_sorted = rst.sort_values(by='opt acc', ascending=False).head(50)
results_sorted['파생변수'].value_counts()



파생변수
11     14
10     10
111     6
101     6
0       5
100     4
1       3
110     2
Name: count, dtype: int64

In [6]:
# 결과값 top 100 비교
print("======= 100개 =======")
# opt acc 기준으로 내림차순 정렬 후 상위 100개 row 선택
results_sorted = rst.sort_values(by='opt acc', ascending=False).head(100)
results_sorted['파생변수'].value_counts()



파생변수
11     21
10     16
101    15
111    11
1      11
100     9
0       9
110     8
Name: count, dtype: int64

In [7]:
# charting : 2
# 결과값 평균 비교
rst_accmean = rst.groupby('파생변수')['opt acc'].mean()
rst_accmean.sort_values(ascending = False)

파생변수
11     0.571699
10     0.570327
101    0.567427
1      0.567102
110    0.566117
111    0.566008
0      0.564149
100    0.563669
Name: opt acc, dtype: float64

In [8]:
# 결과값 중앙값 비교
rst_accmedian = rst.groupby('파생변수')['opt acc'].median()
rst_accmedian.sort_values(ascending = False)

파생변수
10     0.573081
11     0.570715
101    0.569527
110    0.568100
111    0.567839
1      0.566923
0      0.565877
100    0.563417
Name: opt acc, dtype: float64

### 결론

2111211 데이터는 성별, 특약 사고율을 추가해 모델링

In [18]:
X7.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR2111211_011.csv", index = False, encoding="euc-kr")