# LogisticRegression 최적화

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./data/train.csv').drop(columns=['UID'])

In [3]:
df.info()   # non-null: 이상치 없음

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   주거 형태              10000 non-null  object 
 1   연간 소득              10000 non-null  float64
 2   현재 직장 근속 연수        10000 non-null  object 
 3   체납 세금 압류 횟수        10000 non-null  float64
 4   개설된 신용계좌 수         10000 non-null  int64  
 5   신용 거래 연수           10000 non-null  float64
 6   최대 신용한도            10000 non-null  float64
 7   신용 문제 발생 횟수        10000 non-null  int64  
 8   마지막 연체 이후 경과 개월 수  10000 non-null  int64  
 9   개인 파산 횟수           10000 non-null  int64  
 10  대출 목적              10000 non-null  object 
 11  대출 상환 기간           10000 non-null  object 
 12  현재 대출 잔액           10000 non-null  float64
 13  현재 미상환 신용액         10000 non-null  float64
 14  월 상환 부채액           10000 non-null  float64
 15  신용 점수              10000 non-null  int64  
 16  채무 불이행 여부          1000

In [4]:
Y = df['채무 불이행 여부']
X = df.drop('채무 불이행 여부', axis=1)

print(X)

                주거 형태      연간 소득 현재 직장 근속 연수  체납 세금 압류 횟수  개설된 신용계좌 수  \
0                  자가  1941337.5      10년 이상          0.0           9   
1                  월세  1979505.0      10년 이상          0.0           5   
2                  월세  1356381.0          4년          0.0          12   
3                  월세  1049017.5          6년          0.0          15   
4                  월세  4320217.5          2년          0.0          11   
...               ...        ...         ...          ...         ...   
9995  주택 담보 대출 (거주 중)  1339473.0      10년 이상          0.0           9   
9996  주택 담보 대출 (거주 중)  2297230.5          2년          0.0          11   
9997  주택 담보 대출 (거주 중)  1221523.5      10년 이상          0.0           9   
9998               자가  3343584.0      10년 이상          0.0          10   
9999  주택 담보 대출 (거주 중)  2175133.5          5년          0.0           5   

      신용 거래 연수   최대 신용한도  신용 문제 발생 횟수  마지막 연체 이후 경과 개월 수  개인 파산 횟수   대출 목적  \
0         13.4  400597.5            0        

In [5]:
# 원 핫 인코딩
categories = ['주거 형태', '현재 직장 근속 연수', '대출 목적', '대출 상환 기간']


X = pd.get_dummies(X, columns = categories, dtype=int) 

X

Unnamed: 0,연간 소득,체납 세금 압류 횟수,개설된 신용계좌 수,신용 거래 연수,최대 신용한도,신용 문제 발생 횟수,마지막 연체 이후 경과 개월 수,개인 파산 횟수,현재 대출 잔액,현재 미상환 신용액,...,대출 목적_소규모 사업 자금,대출 목적_여행 자금,대출 목적_의료비,대출 목적_이사 비용,대출 목적_자동차 구매,대출 목적_주택 개보수,대출 목적_주택 구매,대출 목적_휴가 비용,대출 상환 기간_단기 상환,대출 상환 기간_장기 상환
0,1941337.5,0.0,9,13.4,400597.5,0,24,1,390903.0,225457.5,...,0,0,0,0,0,0,0,0,1,0
1,1979505.0,0.0,5,15.1,360679.5,0,11,0,1002184.5,64749.0,...,0,0,0,0,0,0,0,0,1,0
2,1356381.0,0.0,12,18.8,491770.5,1,74,3,227775.0,487644.0,...,0,0,0,0,0,0,0,0,1,0
3,1049017.5,0.0,15,14.8,411546.0,1,22,1,251383.5,413211.0,...,0,0,0,0,0,0,0,0,1,0
4,4320217.5,0.0,11,26.1,895288.5,0,32,0,1163176.5,78991.5,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1339473.0,0.0,9,18.7,319027.5,0,68,0,126216.0,177028.5,...,0,0,0,0,0,0,0,0,1,0
9996,2297230.5,0.0,11,28.3,399799.5,0,7,0,371907.0,347449.5,...,0,0,0,0,0,1,0,0,0,1
9997,1221523.5,0.0,9,30.1,823305.0,0,14,0,869736.0,176905.5,...,0,0,0,0,0,0,0,0,0,1
9998,3343584.0,0.0,10,20.3,724314.0,0,25,0,443008.5,139294.5,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# 로그
cl = ['현재 대출 잔액', '현재 미상환 신용액', '월 상환 부채액']

for i in cl:
    X[i] = X[i].apply(lambda x: np.log1p(x))

X

Unnamed: 0,연간 소득,체납 세금 압류 횟수,개설된 신용계좌 수,신용 거래 연수,최대 신용한도,신용 문제 발생 횟수,마지막 연체 이후 경과 개월 수,개인 파산 횟수,현재 대출 잔액,현재 미상환 신용액,...,대출 목적_소규모 사업 자금,대출 목적_여행 자금,대출 목적_의료비,대출 목적_이사 비용,대출 목적_자동차 구매,대출 목적_주택 개보수,대출 목적_주택 구매,대출 목적_휴가 비용,대출 상환 기간_단기 상환,대출 상환 기간_장기 상환
0,1941337.5,0.0,9,13.4,400597.5,0,24,1,12.876217,12.325891,...,0,0,0,0,0,0,0,0,1,0
1,1979505.0,0.0,5,15.1,360679.5,0,11,0,13.817694,11.078289,...,0,0,0,0,0,0,0,0,1,0
2,1356381.0,0.0,12,18.8,491770.5,1,74,3,12.336118,13.097343,...,0,0,0,0,0,0,0,0,1,0
3,1049017.5,0.0,15,14.8,411546.0,1,22,1,12.434739,12.931716,...,0,0,0,0,0,0,0,0,1,0
4,4320217.5,0.0,11,26.1,895288.5,0,32,0,13.966666,11.277108,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1339473.0,0.0,9,18.7,319027.5,0,68,0,11.745758,12.084072,...,0,0,0,0,0,0,0,0,1,0
9996,2297230.5,0.0,11,28.3,399799.5,0,7,0,12.826402,12.758377,...,0,0,0,0,0,1,0,0,0,1
9997,1221523.5,0.0,9,30.1,823305.0,0,14,0,13.675946,12.083377,...,0,0,0,0,0,0,0,0,0,1
9998,3343584.0,0.0,10,20.3,724314.0,0,25,0,13.001346,11.844353,...,0,0,0,0,0,0,0,0,1,0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [8]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Optuna 최적화 함수 정의
def optimize_logistic(trial):
    params = {
        "C": trial.suggest_float("C", 0.0001, 10.0, log=True),  # 정규화 강도 (log scale 탐색)
        "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs", "saga"]),  # 최적화 알고리즘
        "max_iter": trial.suggest_int("max_iter", 100, 1000)  # 최대 반복 횟수
    }

    # 모델 생성 및 학습
    model = LogisticRegression(**params, random_state=42)
    model.fit(X_train, y_train)
    
    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)  # 정확도 최적화

# Optuna 스터디 생성 및 최적화 실행
study_logistic = optuna.create_study(direction="maximize")
study_logistic.optimize(optimize_logistic, n_trials=100)  # 50번 탐색

# 최적의 하이퍼파라미터 출력
best_params_logistic = study_logistic.best_params
print("Best Logistic Regression Hyperparameters:", best_params_logistic)

# 최적의 하이퍼파라미터로 모델 학습
best_logistic = LogisticRegression(**best_params_logistic, random_state=42)
best_logistic.fit(X_train, y_train)

# 최적 모델 평가
y_pred_best = best_logistic.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_best)
print("Final Optimized Logistic Regression Accuracy:", final_accuracy)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-03-20 12:32:46,764] A new study created in memory with name: no-name-5a0034cb-ca95-415e-8d9d-104c4ad3ed09
[I 2025-03-20 12:32:46,797] Trial 0 finished with value: 0.6472 and parameters: {'C': 0.0019151737876718714, 'solver': 'liblinear', 'max_iter': 641}. Best is trial 0 with value: 0.6472.
[I 2025-03-20 12:32:49,503] Trial 1 finished with value: 0.6472 and parameters: {'C': 0.5561301181776523, 'solver': 'saga', 'max_iter': 595}. Best is trial 0 with value: 0.6472.
[I 2025-03-20 12:32:49,516] Trial 2 finished with value: 0.6472 and parameters: {'C': 0.6964826658095727, 'solver': 'liblinear', 'max_iter': 323}. Best is trial 0 with value: 0.6472.
[I 2025-03-20 12:32:50,759] Trial 3 finished with value: 0.6472 and parameters: {'C': 6.813459222896399, 'solver': 'saga', 'max_iter': 434}. Best is trial 0 with value: 0.6472.
[I 2025-03-20 12:32:50,771] Trial 4 finished with value: 0.6472 and parameters: {'C': 0.006082341373079886, 'sol

Best Logistic Regression Hyperparameters: {'C': 0.11393453754325446, 'solver': 'lbfgs', 'max_iter': 522}
Final Optimized Logistic Regression Accuracy: 0.6956


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
