## pandas, numpy, sklearn, seaborn, matplotlib, xgboost, optuna, pycaret, lgbm

## load packages

In [None]:
import copy

import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier
import optuna
import pycaret
from pycaret import classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from xgboost import XGBRegressor
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Load the training data, the test data and the sample submission from the
# current working directory.
df = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
submission = pd.read_csv("./sample_submission.csv")

### macro_f1 set-up

In [None]:
def macro_f1(y_true, y_pred, labels=None, average='macro'):
    """Return the macro F1 score: the unweighted mean of the per-class F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Optional label set, forwarded unchanged to
            ``precision_recall_fscore_support``.
        average: Accepted for backward compatibility only. It is not used;
            the per-class scores are always averaged without weights.

    Returns:
        The mean of the per-class F1 scores.
    """
    # average=None yields one F1 value per class; precision, recall and
    # support are not needed for the result.
    _, _, per_class_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None
    )
    return per_class_f1.mean()

### 근로기간 조정(정수형 변수로 변경)

In [None]:
def unknowns(value):
    """Return 1 when ``value`` is the literal 'Unknown', otherwise 0."""
    return 1 if value == 'Unknown' else 0
    
# Record which rows had an unreported employment length before the mapping
# below folds 'Unknown' into 0 (the same value as '< 1 year').
df['work_ukn'] = df['근로기간'].apply(unknowns)
test['work_ukn'] = test['근로기간'].apply(unknowns)

# Raw employment-length strings (including their spelling variants) mapped to
# integers. '10+ years' maps to 12 so it stays distinct from '10 years'.
replace_set = {
    '< 1 year' : 0,
    '<1 year' : 0,
    '1 year' : 1,
    '1 years' : 1,
    '2 years' : 2,
    '3 years' : 3,
    '4 years' : 4,
    '5 years' : 5,
    '6 years' : 6,
    '7 years' : 7,
    '8 years' : 8,
    '9 years' : 9,
    '10 years' : 10,
    '10+ years' : 12,
    '10+years' : 12,
    '3' : 3,
    'Unknown' : 0
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form is deprecated in pandas and does
# not modify the frame when Copy-on-Write is enabled.
df['근로기간'] = df['근로기간'].replace(replace_set)
test['근로기간'] = test['근로기간'].replace(replace_set)

### Column Adjust(원금, 이자, 대출간 관계 조정)

In [None]:
# Code originally written by someone else.
# Key lines that affect the result.

# Add the repaid-principal / loan-amount and repaid-interest / loan-amount
# ratios, computed the same way for the training and the test frame.
for frame in (df, test):
    frame['원금/대출'] = frame['총상환원금'] / frame['대출금액']
    frame['이자/대출'] = frame['총상환이자'] / frame['대출금액']

### Categorical Columns(대출 기간) Adjust

In [None]:
# Convert the loan-term strings to integer months in one vectorised pass:
# 36 for '36 months', 60 for every other value. This replaces a row-by-row
# .loc loop, which was slow and only worked with a default 0..n-1 index.
# Values are stripped before comparing so that padded strings such as
# ' 36 months' are still recognised instead of silently falling through to 60.
df['대출기간'] = np.where(
    df['대출기간'].astype(str).str.strip() == '36 months', 36, 60
)
test['대출기간'] = np.where(
    test['대출기간'].astype(str).str.strip() == '36 months', 36, 60
)

### One Hot Encoding - Categorical Feature Adjust

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The two categorical columns that are expanded into indicator columns.
cat_cols = ['주택소유상태', '대출목적']

# Categories seen only at transform time are encoded as all zeros
# (handle_unknown='ignore').
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training frame only, then swap the raw columns for the encoded ones.
encoded = pd.DataFrame(
    ohe.fit_transform(df[cat_cols]),
    columns=ohe.get_feature_names_out(),
)
df = pd.concat([df.drop(columns=cat_cols), encoded], axis=1)

# Reuse the fitted encoder on the test frame.
test_encoded = pd.DataFrame(
    ohe.transform(test[cat_cols]),
    columns=ohe.get_feature_names_out(),
)
test = pd.concat([test.drop(columns=cat_cols), test_encoded], axis=1)

### feature scaling(MinMaxScaler)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns are chosen by position (the 2nd through 12th column of df), so this
# depends on the column order produced by the preceding cells.
cols = df.columns[1:12]

# Learn the min/max from the training frame only and apply the same scaling
# to both frames.
mms = MinMaxScaler()
mms.fit(df[cols])
df[cols] = mms.transform(df[cols])
test[cols] = mms.transform(test[cols])

### target feature adjust. ordinal scaling

In [None]:
# Ordinal encoding of the loan grade: 'A' is the highest value (6), 'G' the
# lowest (0).
classes = {
    'A': 6,
    'B': 5,
    'C': 4,
    'D': 3,
    'E': 2,
    'F': 1,
    'G': 0
}

# `target` was used here without ever being defined. Take it from the grade
# column and remove that column from the feature frame in the same step.
# NOTE(review): assumes the training label column is '대출등급', the column the
# submission is written to later in this notebook - confirm against train.csv.
target = df.pop('대출등급').replace(classes)

In [None]:
# Keep the identifiers aside and remove them from both feature frames.
train_id = df.pop('ID')
test_id = test.pop('ID')

### train test split

In [None]:
# Fixed seed so the hold-out split (and every score computed on it) is
# reproducible; stratified so each grade keeps its share in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    df, target, random_state=42, stratify=target
)

### pycaret, model selection

In [None]:
# Use the `classification` module imported at the top of the notebook rather
# than a star import, which would add every public pycaret name to the
# notebook namespace.
exp1 = classification.setup(data=df, target=target)
classification.compare_models()

### lgbm classifier hyper-parameter setting by optuna and stratified KFold

In [None]:
def objective_lgbm(trial):
    """Optuna objective: mean macro F1 of an LGBMClassifier over 5 stratified folds.

    Reads the module-level ``x_train`` / ``y_train`` and scores with
    ``macro_f1``. Each fold trains on its own training part and is scored on
    its own validation part; the hold-out ``x_test`` / ``y_test`` is not
    touched, so it stays usable for the final check.

    Args:
        trial: The optuna trial used to sample hyper-parameters.

    Returns:
        The average macro F1 across the folds.
    """
    params = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.015, 0.1, 0.15, 0.2]),
        'num_leaves': trial.suggest_categorical('num_leaves', [20, 50, 100, 150]),
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200, 300]),
        'max_depth': trial.suggest_categorical('max_depth', [7, 10, 15, 50, 200, 500]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8, 0.9, 1.0]),
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_index, valid_index in kf.split(x_train, y_train):
        # split() yields positional indices, hence .iloc.
        fold_x, fold_y = x_train.iloc[train_index], y_train.iloc[train_index]
        valid_x, valid_y = x_train.iloc[valid_index], y_train.iloc[valid_index]

        model = LGBMClassifier(**params, verbose=-1)
        model.fit(fold_x, fold_y)
        scores.append(macro_f1(valid_y, model.predict(valid_x)))

    return sum(scores) / len(scores)

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=100)

### xgb classifier

In [None]:
def objective_xgb(trial):
    """Optuna objective: mean macro F1 of an XGBClassifier over 5 stratified folds.

    Reads the module-level ``x_train`` / ``y_train`` and scores with
    ``macro_f1``. Each fold trains on its own training part and is scored on
    its own validation part; the hold-out ``x_test`` / ``y_test`` is not
    touched, so it stays usable for the final check.

    Args:
        trial: The optuna trial used to sample hyper-parameters.

    Returns:
        The average macro F1 across the folds.
    """
    params = {
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.015, 0.1, 0.15, 0.2]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [0, 1, 2, 3]),
        'max_depth': trial.suggest_categorical('max_depth', [7, 10, 15, 50, 200, 500]),
        # Single-choice entries: fixed settings that are still recorded with
        # the trial's parameters.
        'objective': trial.suggest_categorical('objective', ['multi:softmax']),
        'num_class': trial.suggest_categorical('num_class', [7]),
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_index, valid_index in kf.split(x_train, y_train):
        # split() yields positional indices, hence .iloc.
        fold_x, fold_y = x_train.iloc[train_index], y_train.iloc[train_index]
        valid_x, valid_y = x_train.iloc[valid_index], y_train.iloc[valid_index]

        model = XGBClassifier(**params)
        model.fit(fold_x, fold_y)
        scores.append(macro_f1(valid_y, model.predict(valid_x)))

    return sum(scores) / len(scores)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=100)

In [None]:
# Hyper-parameters of the final LightGBM classifier.
lgbm_params = {
    'learning_rate': 0.1,
    'num_leaves': 140,
    'n_estimators': 350,
    'max_depth': 30,
}
lgbm_clf = LGBMClassifier(**lgbm_params)

In [None]:
# Hyper-parameters of the final XGBoost classifier.
xgb_params = {
    'learning_rate': 0.2,
    'min_child_weight': 0,
    'max_depth': 500,
    'objective': 'multi:softmax',
    'num_class': 7,
}
xgb_clf = XGBClassifier(**xgb_params)

In [None]:
# Fit LightGBM on the training part of the hold-out split.
# NOTE(review): a later cell calls the same fit again before predicting.
lgbm_clf.fit(x_train, y_train)

In [None]:
# Fit XGBoost on the training part of the hold-out split.
# NOTE(review): a later cell calls the same fit again before predicting.
xgb_clf.fit(x_train, y_train)

In [None]:
# Fit LightGBM on the training part, then keep both the hard predictions and
# the class probabilities (as a DataFrame) for the hold-out part.
lgbm_clf.fit(x_train, y_train)
preds_lgbm = lgbm_clf.predict(x_test)
pred_proba_lgbm = pd.DataFrame(lgbm_clf.predict_proba(x_test))

In [None]:
# Fit XGBoost on the training part, then keep both the hard predictions and
# the class probabilities (as a DataFrame) for the hold-out part.
xgb_clf.fit(x_train, y_train)
preds_xgb = xgb_clf.predict(x_test)
pred_proba_xgb = pd.DataFrame(xgb_clf.predict_proba(x_test))

### 결괏값 미세조정

In [None]:
# Blend the two models on the hold-out set. Start from the XGBoost prediction
# and switch to LightGBM only where the models disagree AND LightGBM's top
# class probability exceeds XGBoost's by more than 0.357.
# Done with array operations: no per-row .loc lookups and no dependency on
# the `copy` module.
xgb_top_proba = pred_proba_xgb.max(axis=1).to_numpy()
lgbm_top_proba = pred_proba_lgbm.max(axis=1).to_numpy()
use_lgbm = (preds_xgb != preds_lgbm) & (xgb_top_proba < lgbm_top_proba - 0.357)
predicted_test = np.where(use_lgbm, preds_lgbm, preds_xgb)

### ensemble - voting classifier 사용(제출되지는 않음)

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft voting over the LightGBM and XGBoost classifiers.
voting_members = [('lgbm', lgbm_clf), ('xgb', xgb_clf)]
vot_clf = VotingClassifier(estimators=voting_members, voting='soft')
vot_clf.fit(x_train, y_train)

In [None]:
preds_vote = vot_clf.predict(x_test)
# Ground truth first, predictions second, matching macro_f1(y_true, y_pred)
# and the other call sites in this notebook.
macro_f1(y_test, preds_vote)

In [None]:
# Refit both classifiers on the complete training frame.
for final_model in (xgb_clf, lgbm_clf):
    final_model.fit(df, target)

# Hard predictions for the test frame.
predicted_xgb = xgb_clf.predict(test)
predicted_lgbm = lgbm_clf.predict(test)

# Class probabilities for the test frame, as DataFrames.
predict_proba_xgb = pd.DataFrame(xgb_clf.predict_proba(test))
predict_proba_lgbm = pd.DataFrame(lgbm_clf.predict_proba(test))

In [None]:
# Blend the two models on the test set. Start from the XGBoost prediction and
# switch to LightGBM only where the models disagree AND LightGBM's top class
# probability exceeds XGBoost's by more than 0.517.
# Every array used here is a test-set array: the previous loop took its length
# from the hold-out predictions (preds_xgb) and copied hold-out LightGBM
# predictions (preds_lgbm) into the submission.
xgb_top_proba = predict_proba_xgb.max(axis=1).to_numpy()
lgbm_top_proba = predict_proba_lgbm.max(axis=1).to_numpy()
use_lgbm = (predicted_xgb != predicted_lgbm) & (xgb_top_proba < lgbm_top_proba - 0.517)
predicted_submission = np.where(use_lgbm, predicted_lgbm, predicted_xgb)

In [None]:
submission = pd.read_csv("sample_submission.csv")

submission['대출등급'] = predicted_submission

# Map the integer codes back to the grade letters. The result is assigned back
# rather than using replace(..., inplace=True) on the selected column: the
# chained in-place form is deprecated in pandas and does not modify the frame
# when Copy-on-Write is enabled.
reversed_classes = {v: k for k, v in classes.items()}
submission['대출등급'] = submission['대출등급'].replace(reversed_classes)

In [None]:
# Use ID as the index so it is written as the first column of the CSV.
submission = submission.set_index('ID')

In [None]:
# Write the submission file to the current working directory.
submission.to_csv("submission_file_name.csv")