### 라이브러리 호출

In [1]:
# 연산 처리  패키지
import pandas as pd
import numpy as np

# 시각화 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 통계분석 패키지
import statsmodels.api as sm

# 그래프 설정
%matplotlib inline

# 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, mean_squared_error, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import math
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# SMOTE를 사용한 oversampling 적용
from imblearn.over_sampling import SMOTE, SMOTENC
from collections import Counter

### Data 불러오기

In [6]:
# 코랩 mount
from google.colab import drive
drive.mount('/content/drive')

# 데이터 읽어오기
df1 = pd.read_csv("/content/drive/MyDrive/기계학습의이해/LR/LR2111211_011.csv", encoding="euc-kr", engine='python')
df2 = pd.read_csv("/content/drive/MyDrive/기계학습의이해/LR/LR1111211_001.csv", encoding="euc-kr", engine='python')
df3 = pd.read_csv("/content/drive/MyDrive/기계학습의이해/LR/LR3111211_000.csv", encoding="euc-kr", engine='python')

base = pd.read_csv("/content/drive/MyDrive/기계학습의이해/Dataset/base_process.csv", encoding = "euc-kr", engine='python')
yn_y = base['사고유무']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Model 함수 정의

In [7]:
# 로지스틱 회귀 함수
def logistic_classifier(X, y):

    # 평가지표 list
    acc = []
    prec = []
    recall = []
    auc = []

    acc_opt = []
    prec_opt = []
    recall_opt = []

    best_param = []
    opt_thres = []

    # 하이퍼파라미터 그리드 설정
    param_grid = {
        'C': [0.1, 1, 10]
    }

    for i in range(100):

        # y 범주 비율에 맞춰 train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

        # 데이터 Random Sampling
        X_train_rd = X_train.sample(n = 15000)
        y_train_rd = y_train[X_train_rd.index]

        # 계산을 위해 datatype float로 변경
        for col_name in X_train_rd.columns:
            X_train_rd[col_name] = X_train_rd[col_name].astype(float)

        # SMOTE로 X oversampling
        smt = SMOTE(sampling_strategy = 'auto')
        X_train_sm, y_train_sm = smt.fit_resample(X_train_rd, y_train_rd)

        # GridSearchCV를 사용한 하이퍼파라미터 튜닝
        CV_lr = GridSearchCV(LogisticRegression(), param_grid = param_grid, cv = 5)
        CV_lr.fit(X_train_sm, y_train_sm)

        # 최적의 파라미터 저장
        best_param.append(CV_lr.best_params_)

        # 테스트 데이터셋에 대한 예측 수행
        y_pred = CV_lr.predict(X_test)

        # AUC 계산, threshold 조절한 경우 정확도 측정을 위한 1 발생 확률 계산
        y_proba = CV_lr.predict_proba(X_test)[:, 1]

        # thr = 0.5일 때 평가지표 저장
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_proba))

        # Optimal thr 저장
        fper, tper, thresholds = roc_curve(y_test, y_proba)
        optimal_idx = np.argmax(tper - fper)   # fpr, tpr 간 차이가 가장 클 때의 index 저장

        # Optimal thr일 때 평가지표 저장
        y_optpred = (y_proba > thresholds[optimal_idx]).astype(int)

        acc_opt.append(accuracy_score(y_test, y_pred))
        prec_opt.append(precision_score(y_test, y_optpred))
        recall_opt.append(recall_score(y_test, y_optpred))
        opt_thres.append(thresholds[optimal_idx])

        # 횟수 출력
        if ((i+1)%10 == 0) :
            print(f"{i+1}번째 완료")

    # 결과 DataFrame 생성
    results = pd.DataFrame({
        '0.5 acc' : acc,
        '0.5 prec': prec,
        '0.5 recall': recall,
        'opt acc' : acc_opt,
        'opt prec' : prec_opt,
        'opt recall' : recall_opt,
        'auc': auc,
        'best param' : best_param,
        'opt threshold' : opt_thres
    })

    return results

### Logistic Regression 결과 저장

In [8]:
# df1
df1_rst = logistic_classifier(df1, yn_y)

df1_rst.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR2111211_011_result.csv", index = False)

10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료


In [9]:
# df2
df2_rst = logistic_classifier(df2, yn_y)

df2_rst.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR1111211_001_result.csv", index = False)

10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료


In [10]:
# df3
df3_rst = logistic_classifier(df3, yn_y)

df3_rst.to_csv("/content/drive/MyDrive/기계학습의이해/LR/LR3111211_000_result.csv", index = False)

10번째 완료
20번째 완료
30번째 완료
40번째 완료
50번째 완료
60번째 완료
70번째 완료
80번째 완료
90번째 완료
100번째 완료


### 폰트

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

In [None]:
import matplotlib as mpl
import matplotlib.font_manager as fm

print ('버전: ', mpl.__version__)
print ('설치 위치: ', mpl.__file__)
print ('설정 위치: ', mpl.get_configdir())
print ('캐시 위치: ', mpl.get_cachedir())


print ('설정파일 위치: ', mpl.matplotlib_fname())


font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf')

# ttf 폰트 전체갯수
print(len(font_list))

font_list

버전:  3.7.1
설치 위치:  /usr/local/lib/python3.10/dist-packages/matplotlib/__init__.py
설정 위치:  /root/.config/matplotlib
캐시 위치:  /root/.cache/matplotlib
설정파일 위치:  /usr/local/lib/python3.10/dist-packages/matplotlib/mpl-data/matplotlibrc
29


['/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
 '/usr/share/fonts/truetype/nanum/NanumMyeongjoBold.ttf',
 '/usr/share/fonts/truetype/humor-sans/Humor-Sans.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationMono-BoldItalic.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Italic.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
 '/usr/share/fonts/truetype/nanum/NanumSquareR.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Regular.ttf',
 '/usr/share/fonts/truetype/nanum/NanumSquareRoundR.ttf',
 '/usr/share/fonts/truetype/nanum/NanumSquareB.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Bold.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationMono-Italic.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
 '/usr/share/fonts/truetype/liber

In [None]:
# seabron으로 그래프 스타일 설정
import seaborn as sns

"""
아래 5개중 원하는 그래프 스타일 설정.
"""
# sns.set_style('whitegrid')
sns.set_style('darkgrid')
# sns.set_style('dark')
# sns.set_style('white')
# sns.set_style('ticks')

#----------------------------------------------------위 아래가 아예 다른내용이지만, seaborn설정후 폰트적용 셀을 매번 꼭 다시 실행해줘야하기에 셀을 붙여놨습니다.

# matplot으로 한글 font 설정
import matplotlib
import matplotlib.pyplot as plt

# 한글 폰트 사용시 글자를 선명하게
%config InlineBackend.figure_format = 'retina'

# '-' 음수 부호 보이게 설정
matplotlib.rc('axes', unicode_minus=False)

# 한글 폰트 설치
# plt.rc('font', family='NanumBarunGothic')
plt.rc('font', family='NanumGothic')

### 결과 확인

In [12]:
df1_rst['opt acc'].mean()

0.5923029741102632

In [13]:
df2_rst['opt acc'].mean()

0.5923227064165656

In [14]:
df3_rst['opt acc'].mean()

0.5913277702493879

별 차이가 없음... 그보다 정확도 왜이러지?????

In [15]:
df1_rst['0.5 acc'].mean()

0.5923029741102632

In [16]:
df2_rst['0.5 acc'].mean()

0.5923227064165656

In [17]:
df3_rst['0.5 acc'].mean()

0.5913277702493879