In [None]:
import os
import joblib
import datetime as dt
import pickle
from tqdm import tqdm
# import conf
import folium
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
import japanize_matplotlib
import datetime
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
%load_ext autoreload
%autoreload 2


# Set pandas' display limits to 300 rows and 600 columns so that wide frames
# are not elided when a cell displays them.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 600)


In [None]:
# Read the competition CSVs, using the first column as the index.
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

# Tag every row with its origin so the frames can be told apart once combined.
df_train = df_train.assign(type='train')
df_test = df_test.assign(type='test')


In [None]:
# Report (rows, columns) of each frame.
print(df_train.shape)
print(df_test.shape)

# EDA

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
def split_ymd(df, col):
    """Expand a date-string column into datetime, year, month and day columns.

    ``df[col]`` is parsed with the ``'%d-%b-%y'`` format and four columns are
    added to ``df`` in place: ``{col}_dt``, ``{col}_year``, ``{col}_month``
    and ``{col}_day``.

    Args:
        df: DataFrame holding the column to expand; it is modified in place.
        col: Name of the column containing the date strings.

    Returns:
        The same ``df`` object, with the four new columns attached.
    """
    parsed_name = f'{col}_dt'
    df[parsed_name] = pd.to_datetime(df[col], format='%d-%b-%y')

    # Derive each calendar component from the parsed datetime column.
    for part in ('year', 'month', 'day'):
        df[f'{col}_{part}'] = getattr(df[parsed_name].dt, part)

    return df


def preprocess(df):
    """Convert raw loan columns into numeric values usable by the model.

    All steps are applied to ``df`` in place:
      * 'Y' / 'N' in ``RevLineCr`` and ``LowDoc`` become 1 / 0 (any other
        value is left untouched),
      * ``DisbursementDate`` and ``ApprovalDate`` are expanded with
        ``split_ymd`` into ``*_dt``, ``*_year``, ``*_month``, ``*_day``,
      * ``$`` and ``,`` are stripped from ``DisbursementGross``, ``GrAppv``
        and ``SBA_Appv`` and the result is cast to int.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        The same DataFrame object with the conversions applied.
    """
    flag_cols = ['RevLineCr', 'LowDoc']
    df[flag_cols] = df[flag_cols].replace({'Y': 1, 'N': 0})
    df = split_ymd(df, 'DisbursementDate')
    df = split_ymd(df, 'ApprovalDate')

    for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
        # Already numeric (e.g. the cell was re-run on a processed frame):
        # the .str accessor would raise, so leave the column as it is.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        # Raw string literal: '\$' inside a normal literal is an invalid
        # escape sequence and triggers a warning on current Python versions.
        df[col] = df[col].str.replace(r'[\$,]', '', regex=True).astype(float).astype(int)

    return df

In [None]:
# List the distinct values present in LowDoc.
df_train['LowDoc'].unique()

In [None]:
# Run the same preprocessing on both frames so their columns stay comparable.
df_train = preprocess(df_train)
df_test = preprocess(df_test)


In [None]:
# Histogram of the target column; the cell's displayed value is its mean
# (the share of 1s if MIS_Status is coded 0/1 — TODO confirm the coding).
df_train['MIS_Status'].hist()
df_train['MIS_Status'].mean()

In [None]:
# Overlay train and test histograms for every column to look for distribution
# shift (author's finding: no difference in the feature distributions).
import seaborn as sns
# ignore_index=True gives every row a unique label. Both frames are indexed by
# their first CSV column, and seaborn can fail to align `hue` when the
# combined frame carries duplicate index labels.
tmp = pd.concat([df_train, df_test], ignore_index=True)

for col in df_train.columns:
    sns.histplot(x=col, data=tmp, hue='type', bins=10, alpha=0.5)
    plt.show()

In [None]:
# Count missing values per column in the training frame.
df_train.isna().sum()

# 学習

In [None]:
# Display the training frame as it stands before feature selection.
df_train

In [None]:
# List the available column names.
df_train.columns

In [None]:
# Columns fed to the model. The commented-out names are candidate columns
# that are currently excluded from training.
features = ['Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
            'FranchiseCode', 
            # 'RevLineCr', 'LowDoc',
            'Sector', 
            # 'ApprovalFY', 
            # 'City', 'State', 'BankState', 
            'DisbursementGross', 'GrAppv', 'SBA_Appv', 'UrbanRural', 
            'DisbursementDate_year', 'DisbursementDate_month', 'DisbursementDate_day',
            'ApprovalDate_year', 'ApprovalDate_month', 'ApprovalDate_day']

# Name of the label column.
target = 'MIS_Status'

In [None]:
# Display only the selected feature columns.
df_train[features]

In [None]:
# Print the dtype and non-null count of each selected feature.
df_train[features].info()

In [None]:
from sklearn.metrics import f1_score

def custom_f1_score(preds, train_data):
    """Compute F1 in the (name, value, flag) form of a LightGBM custom metric.

    Args:
        preds: Predicted scores; values above 0.5 are counted as class 1.
        train_data: Object exposing ``get_label()``, which supplies the true
            labels.

    Returns:
        Tuple ``('f1', score, True)``. ``score`` comes from ``f1_score`` with
        its default averaging; the trailing ``True`` is the is-higher-better
        flag of LightGBM's custom-eval convention.
    """
    labels = train_data.get_label()
    # Binary classification: threshold the scores to obtain hard labels.
    hard_preds = (preds > 0.5).astype(int)
    return 'f1', f1_score(labels, hard_preds), True

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter; shuffling with a fixed random_state keeps the
# fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Model inputs and label taken from the training frame.
x_train = df_train[features]
y_train = df_train[target]

In [None]:

# LightGBM hyper-parameters shared by every fold.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    # 'max_depth':10,
    'num_leaves': 200,
    'feature_fraction': 0.8,  # default = 1.0
    'bagging_freq': 1,        # default = 0
    'bagging_fraction': 0.8,  # default = 1.0
    'n_estimators': 10000,    # upper bound; early stopping ends training sooner
    # 'importance_type': 'gain',
}

models = []                        # one fitted classifier per fold
pred_oof = np.zeros(len(x_train))  # out-of-fold class-1 probability per row

for i, (train_index, valid_index) in enumerate(skf.split(x_train, y_train)):
    print(f"Fold {i}:start training")

    # skf.split yields *positional* indices. The frame is indexed by the first
    # CSV column, so label-based .loc / y_train[...] only selects the right
    # rows when that index happens to be 0..n-1; .iloc is correct either way.
    x_tr, y_tr = x_train.iloc[train_index], y_train.iloc[train_index]
    x_va, y_va = x_train.iloc[valid_index], y_train.iloc[valid_index]

    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_va, y_va)],
              callbacks=[lgb.early_stopping(stopping_rounds=200,
                                            verbose=True),  # early-stopping callback
                         lgb.log_evaluation(100)],  # periodic console logging callback
             )
    models.append(model)
    pred_oof[valid_index] = model.predict_proba(x_va)[:, 1]

In [None]:
# Store the out-of-fold predictions on the training frame and plot their
# distribution.
df_train['pred'] = pred_oof
df_train['pred'].hist()

In [None]:
from sklearn.metrics import f1_score

# Scan candidate decision thresholds and record the macro-F1 of the
# out-of-fold predictions at each one.
thrs = np.arange(0, 1, 0.001)
y_true = df_train[target]  # loop-invariant, so looked up once

# Iterate over `thrs` itself instead of rebuilding the grid, so scores[i]
# always corresponds to thrs[i]. A comprehension is used so that no loop
# variable is left behind in `thr`, the name a later cell uses for the
# threshold that is finally chosen.
scores = [
    f1_score(y_true, (df_train['pred'] > candidate).astype(int), average='macro')
    for candidate in thrs
]
    

In [None]:
# Score as a function of the decision threshold.
plt.plot(thrs, scores)

In [None]:
# Pick the threshold with the highest score and print both values.
idx = np.argmax(scores)
thr = thrs[idx]
print(thr, scores[idx])

# 結果の解釈

In [None]:
# Top-20 feature importances of `model` (whichever classifier is currently
# bound to that name), drawn as a horizontal bar chart.
importance = (
    pd.DataFrame(
        model.feature_importances_,
        index=x_train.columns,
        columns=['importance'],
    )
    .sort_values('importance', ascending=False)
    .head(20)
)
importance.plot.barh()

In [None]:
# pred_oof is a 1-D array holding one class-1 probability per row, so it
# fills a single column; assigning it to three 'pred_*' columns raises
# ValueError ("Columns must be same length as key").
df_train['pred'] = pred_oof
df_train

In [None]:
# Distribution of the out-of-fold predictions. This binary task has a single
# 'pred' column; 'pred_0'..'pred_2' do not exist in the frame.
df_train['pred'].hist()

In [None]:
# NOTE(review): no 'health' column is created or referenced anywhere else in
# this notebook, so the original lookup appears to be a leftover; the class
# balance of the actual label column is shown instead — confirm.
df_train[target].value_counts()

# 予測値付与

In [None]:
# Average the class-1 probabilities of all fold models on the test set.
x_test = df_test[features]  # same slice for every model, so take it once
pred_test = np.zeros(len(df_test))
for model in models:
    # Divide by the actual number of models rather than a hard-coded 5, so
    # the average stays correct if the number of folds changes.
    pred_test += model.predict_proba(x_test)[:, 1] / len(models)
df_test['pred'] = pred_test
# Binarise with `thr`, the threshold chosen on the out-of-fold predictions.
df_test['pred_class'] = (df_test['pred'] > thr).astype(int)

In [None]:
# Distribution of the averaged test-set probabilities.
df_test['pred'].hist()


In [None]:
# Distribution of the thresholded test-set labels.
df_test['pred_class'].hist()

In [None]:
# Load the sample submission file (no header row, first column as index) and
# reuse its layout for the real submission.
sample_submit = pd.read_csv(
    '../data/sample_submission.csv',
    index_col=0,
    header=None,
)
# The assignment aligns on index labels, so each submission row receives the
# predicted class of the test row carrying the same label.
sample_submit[1] = df_test['pred_class']
sample_submit.to_csv('../data/submit.csv', header=None)

# submit

In [None]:
!signate submit --competition-id=1337 ../data/submit.csv --note macro-f1