In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from collections import Counter
from glob import glob
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score #, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
mpl.rcParams['axes.formatter.limits'] = [-10,10]

In [3]:
df = pd.read_csv('open/train.csv', encoding='utf-8')
df.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [4]:
df.shape

(26457, 20)

In [5]:
#df = df.drop('index',axis=1)

In [6]:
df = df.fillna('unknown')

## credit 비율

In [7]:
credit = Counter(df.credit)
credit

Counter({1.0: 6267, 2.0: 16968, 0.0: 3222})

In [8]:
credit_1, credit_2, credit_0 = credit[1.0], credit[2.0], credit[0.0]
credit_1, credit_2, credit_0

(6267, 16968, 3222)

## credit별 gender 비율
- credit에 관계없이 여자가 두 배 정도 많음

In [9]:
gender_1 = Counter(df[df['credit']==1.0]['gender'])
gender_1['F']/credit_1, gender_1['M']/credit_1

(0.6733684378490505, 0.3266315621509494)

In [10]:
gender_2 = Counter(df[df['credit']==2.0]['gender'])
gender_2['F']/credit_2, gender_2['M']/credit_2

(0.6676685525695427, 0.33233144743045734)

In [11]:
gender_0 = Counter(df[df['credit']==0.0]['gender'])
gender_0['F']/credit_0, gender_0['M']/credit_0

(0.6666666666666666, 0.3333333333333333)

## credit별 car 비율

In [12]:
car_1 = Counter(df[df['credit']==1.0]['car'])
car_1['Y']/credit_1, car_1['N']/credit_1

(0.363331737673528, 0.636668262326472)

In [13]:
car_2 = Counter(df[df['credit']==2.0]['car'])
car_2['Y']/credit_2, car_2['N']/credit_2

(0.3848420556341348, 0.6151579443658651)

In [14]:
car_0 = Counter(df[df['credit']==0.0]['car'])
car_0['Y']/credit_0, car_0['N']/credit_0

(0.38485412787088763, 0.6151458721291123)

## credit별 reality 비율

In [15]:
reality_1 = Counter(df[df['credit']==1.0]['reality'])
reality_1['Y']/credit_1, reality_1['N']/credit_1

(0.691239827668741, 0.30876017233125896)

In [16]:
reality_2 = Counter(df[df['credit']==2.0]['reality'])
reality_2['Y']/credit_2, reality_2['N']/credit_2

(0.6683168316831684, 0.3316831683168317)

In [17]:
reality_0 = Counter(df[df['credit']==0.0]['reality'])
reality_0['Y']/credit_0, reality_0['N']/credit_0

(0.6697703289882061, 0.3302296710117939)

#### credit별 변수 구성요소 비율

In [18]:
df.child_num.unique()

array([ 0,  1,  2,  3,  4,  5, 14, 19,  7], dtype=int64)

In [19]:
a = Counter(df.child_num)
a

Counter({0: 18340, 1: 5386, 2: 2362, 3: 306, 4: 47, 5: 10, 14: 3, 19: 1, 7: 2})

In [20]:
a.values()

dict_values([18340, 5386, 2362, 306, 47, 10, 3, 1, 2])

In [21]:
a[0]

18340

In [22]:
df = df.drop('index', axis=1)

In [23]:
def ratio(df):
    raio_lst = []
    credit = Counter(df.credit)
    for col in df.columns:
        if col == 'credit':
            break
        if len(df[col].unique())>20:
            print('********', col)
            continue
        print('[', col, ']')
        for credit_num in [1.0, 2.0, 0.0] :
            print('  credit : ',credit_num, end ='    ')
            lst = []
            for idx in df[col].unique():
                cnt_dict = Counter(df[df['credit']==credit_num][col])
                result = cnt_dict[idx]/credit[credit_num]
                lst.append([idx, result])
                print('idx: ', idx, ' ', result, end=' ')
            print()

In [24]:
ratio(df)

[ gender ]
  credit :  1.0    idx:  F   0.6733684378490505 idx:  M   0.3266315621509494 
  credit :  2.0    idx:  F   0.6676685525695427 idx:  M   0.33233144743045734 
  credit :  0.0    idx:  F   0.6666666666666666 idx:  M   0.3333333333333333 
[ car ]
  credit :  1.0    idx:  N   0.636668262326472 idx:  Y   0.363331737673528 
  credit :  2.0    idx:  N   0.6151579443658651 idx:  Y   0.3848420556341348 
  credit :  0.0    idx:  N   0.6151458721291123 idx:  Y   0.38485412787088763 
[ reality ]
  credit :  1.0    idx:  N   0.30876017233125896 idx:  Y   0.691239827668741 
  credit :  2.0    idx:  N   0.3316831683168317 idx:  Y   0.6683168316831684 
  credit :  0.0    idx:  N   0.3302296710117939 idx:  Y   0.6697703289882061 
[ child_num ]
  credit :  1.0    idx:  0   0.6896441678634115 idx:  1   0.20951013243976385 idx:  2   0.08536779958512845 idx:  3   0.01260571246210308 idx:  4   0.0019147917663954045 idx:  5   0.0009573958831977022 idx:  14   0.0 idx:  19   0.0 idx:  7   0.0 
  cred

  credit :  2.0    idx:  2.0   0.5384842055634135 idx:  3.0   0.17291371994342292 idx:  4.0   0.08775341819896275 idx:  1.0   0.18776520509193775 idx:  5.0   0.011197548326261197 idx:  6.0   0.0014733616218764733 idx:  7.0   5.8934464875058934e-05 idx:  15.0   0.0001768033946251768 idx:  20.0   5.8934464875058934e-05 idx:  9.0   0.00011786892975011787 
  credit :  0.0    idx:  2.0   0.5338299193047796 idx:  3.0   0.1893234016139044 idx:  4.0   0.07790192427063936 idx:  1.0   0.1877715704531347 idx:  5.0   0.008379888268156424 idx:  6.0   0.0021725636250775914 idx:  7.0   0.0006207324643078833 idx:  15.0   0.0 idx:  20.0   0.0 idx:  9.0   0.0 
******** begin_month


In [25]:
# income_total, DAYS_BIRTH, DAYS_EMPLOYED, begin_month

In [26]:
df.begin_month.unique()

array([ -6.,  -5., -22., -37., -26., -18., -41., -53., -38., -40., -51.,
       -60.,  -2., -14.,  -7., -35.,  -4., -13., -57., -47., -33., -30.,
       -20.,  -8., -39., -21., -19., -24., -48., -12., -10., -42., -29.,
        -3., -23., -25.,  -1., -15., -32., -59., -54., -34.,   0., -27.,
       -45., -56., -46.,  -9., -44., -36., -43., -49., -11., -55., -58.,
       -28., -52., -17., -50., -16., -31.])

# lgbm

In [27]:
features = ['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size', 'begin_month']

In [28]:
X = df[features]
y = df[['credit']]

In [29]:
X.shape, y.shape

((26457, 18), (26457, 1))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2021)

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train, test_size=0.25, random_state=2021)

In [32]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((15873, 18), (15873, 1), (5292, 18), (5292, 1), (5292, 18), (5292, 1))

In [33]:
numerical_features = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [34]:
categorical_features = ['gender', 'car', 'reality', 'child_num', 'income_type',
       'edu_type', 'family_type', 'house_type',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [35]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [36]:
# def get_clf_eval(y_test, y_pred):
#     confusion = confusion_matrix(y_test, y_pred)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     F1 = f1_score(y_test, y_pred)
#     AUC = roc_auc_score(y_test, y_pred)
#     print('오차행렬:\n', confusion)
#     print('\n정확도: {:.4f}'.format(accuracy))
#     print('정밀도: {:.4f}'.format(precision))
#     print('재현율: {:.4f}'.format(recall))
#     print('F1: {:.4f}'.format(F1))
#     print('AUC: {:.4f}'.format(AUC))

In [37]:
#X_train

In [38]:
from lightgbm import LGBMClassifier
 
# LGBMClassifier pipeline
pipe_lgbm = make_pipeline(preprocessor,LGBMClassifier())
 
# 파라미터 목록
param_grid = {'lgbmclassifier__objective':['multiclass'],
               'lgbmclassifier__learning_rate': [0.05, 0.1, 0.2], 
               'lgbmclassifier__max_depth': [1, 5, 10],
               'lgbmclassifier__min_child_weight': [3, 4, 5]}
 

gs_lgbm = GridSearchCV(estimator=pipe_lgbm,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1)
 
gs_lgbm.fit(X_train, y_train)

print("최적의 매개변수 조합: %s" %gs_lgbm.best_params_)
reg_lgbm = gs_lgbm.best_estimator_
 



최적의 매개변수 조합: {'lgbmclassifier__learning_rate': 0.2, 'lgbmclassifier__max_depth': 10, 'lgbmclassifier__min_child_weight': 4, 'lgbmclassifier__objective': 'multiclass'}


In [39]:
y_lgbm_pred = reg_lgbm.predict(X_test)

In [40]:
accuracy_score(y_test, y_lgbm_pred)

0.6965230536659108

In [41]:
y_lgbm_pred = reg_lgbm.predict_proba(X_test)

In [42]:
log_loss(y_test, y_lgbm_pred)

0.7671805702019729

# test data

In [44]:
X_unk = pd.read_csv('open/test.csv', encoding='utf-8')
X_unk = X_unk.drop(columns=['index'])
X_unk = X_unk.fillna('Unknown')

In [45]:
features = ['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size', 'begin_month']

In [48]:
X_unk_test = X_unk[features]
#y_test = X_unk[['credit']]

In [49]:
numerical_features = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [50]:
categorical_features = ['gender', 'car', 'reality', 'child_num', 'income_type',
       'edu_type', 'family_type', 'house_type',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [51]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [52]:
curr_time = datetime.now().strftime('%y%m%d%H%M%S')

In [54]:
X_unk_raw = pd.read_csv('open/test.csv', encoding='utf-8')
l_idx = X_unk_raw['index']
submission = pd.DataFrame(np.insert(reg_lgbm.predict_proba(X_unk_test), 0, l_idx, axis=1), columns=['index', 0, 1, 2])
submission['index'] = submission['index'].astype('int')

In [None]:
#submission = reg_lgbm.predict_proba(X_test)

In [55]:
submission.to_csv(f'submission_{curr_time}.csv', index=False)