In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np

In [25]:
train_data = pd.read_csv('./input/train_public.csv')
train_internet = pd.read_csv('./input/train_internet.csv')
submit_example = pd.read_csv('./input/submit_example.csv')
test_public = pd.read_csv('./input/test_public.csv')

In [26]:
train_data.dtypes

loan_id                       int64
user_id                       int64
total_loan                  float64
year_of_loan                  int64
interest                    float64
monthly_payment             float64
class                        object
employer_type                object
industry                     object
work_year                    object
house_exist                   int64
censor_status                 int64
issue_date                   object
use                           int64
post_code                     int64
region                        int64
debt_loan_ratio             float64
del_in_18month                int64
scoring_low                 float64
scoring_high                float64
known_outstanding_loan        int64
known_dero                    int64
pub_dero_bankrup            float64
recircle_b                  float64
recircle_u                  float64
initial_list_status           int64
app_type                      int64
earlies_credit_mon          

In [27]:
work_year_dict = {
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_public['work_year'] = test_public['work_year'].map(work_year_dict)

In [28]:
class_dict = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
}

train_data['class'] = train_data['class'].map(class_dict)
test_public['class'] = test_public['class'].map(class_dict)

In [29]:
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])

train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_public['issue_date_month'] = train_data['issue_date'].dt.month

train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_public['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek

In [30]:
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1)

In [31]:
cat_cols = ['employer_type', 'industry']

from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])

In [45]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def k_fold_serachParmaters(model,train_data, train_label, test_data):
    n_splits=5
    
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    pred_Test = np.zeros(len(test_data))
    
    auc_train, auc_val = 0, 0
    for tr_idx, val_idx in sk.split(train_data, train_label):
        x_train = train_data.iloc[tr_idx]
        y_train = train_label.iloc[tr_idx]
        x_val = train_data.iloc[val_idx]
        y_val = train_label.iloc[val_idx]

        model.fit(x_train, y_train, 
                  eval_set=[(x_val, y_val)], 
                  categorical_feature = cat_cols,
                 early_stopping_rounds=100,
                 verbose=False)

        pred_Test += model.predict_proba(test_data)[:, 1]/n_splits

        pred = model.predict(x_val)
        auc_val += roc_auc_score(y_val,pred)/n_splits
        
        pred = model.predict(x_train)
        auc_train += roc_auc_score(y_train, pred)/n_splits
        
        
    return auc_val, pred_Test

In [51]:
import warnings
warnings.filterwarnings("ignore")

score_tta = None
score_list = []

tta_fold = 20
for _ in range(tta_fold):
    clf = lgb.LGBMClassifier(
        num_leaves=np.random.randint(6, 10), min_child_samples= np.random.randint(2,5),
        max_depth=7,learning_rate=0.01,
        n_estimators=2050,n_jobs=-1)

    score, test_pred = k_fold_serachParmaters(clf,
                           train_data.drop(['loan_id', 'user_id', 'isDefault'], axis=1),
                           train_data['isDefault'],
                           test_public.drop(['loan_id', 'user_id',], axis=1),
                          )

    print(score)
    if score_tta is None:
        score_tta = test_pred/tta_fold
    else:
        score_tta += test_pred/tta_fold
    score_list.append(score)

0.6379297058384644
0.6378193432101231
0.6411947923481109
0.6379297058384644
0.639650980531848
0.6379297058384644
0.6397703258341599
0.6388898032496009
0.6388898032496009
0.6388898032496009
0.6411947923481109
0.6398875480086581
0.6399286984891315
0.6411947923481109
0.6388898032496009
0.639650980531848
0.6379297058384644
0.6379297058384644
0.6399286984891315
0.6397703258341599


In [53]:
test_public['isDefault'] = score_tta

In [60]:
test_public.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('aaa.csv', index=None)