In [6]:
#pip install kaggler
#!pip install pycaret

In [10]:
import lightgbm as lgb
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from tensorflow import keras
import warnings

from kaggler.preprocessing import LabelEncoder
from kaggler.model import AutoLGB

import kaggler
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
warnings.simplefilter('ignore')

In [11]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#결측치 처리
train.fillna('NaN',inplace = True)
test.fillna('NaN',inplace = True)

In [15]:
target_col = 'credit'
df = pd.concat([train.drop(target_col, axis=1), test], axis=0)

# Label Encoding

In [16]:
cat_cols = [x for x in df.columns if df[x].dtype == 'object']
num_cols = [x for x in df.columns if x not in cat_cols + [target_col]]
feature_cols = num_cols + cat_cols
print(len(feature_cols), len(cat_cols), len(num_cols))

19 8 11


In [17]:
lbe = LabelEncoder(min_obs=10)
df[cat_cols] = lbe.fit_transform(df[cat_cols])
df[cat_cols].head()

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,occyp_type
0,0,0,1,1,1,0,2,0
1,0,0,0,1,0,2,0,1
2,1,1,0,0,1,0,0,4
3,0,0,0,1,0,0,0,3
4,0,1,0,3,1,0,0,4


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          36457 non-null  int64  
 1   gender         36457 non-null  int64  
 2   car            36457 non-null  int64  
 3   reality        36457 non-null  int64  
 4   child_num      36457 non-null  int64  
 5   income_total   36457 non-null  float64
 6   income_type    36457 non-null  int64  
 7   edu_type       36457 non-null  int64  
 8   family_type    36457 non-null  int64  
 9   house_type     36457 non-null  int64  
 10  DAYS_BIRTH     36457 non-null  int64  
 11  DAYS_EMPLOYED  36457 non-null  int64  
 12  FLAG_MOBIL     36457 non-null  int64  
 13  work_phone     36457 non-null  int64  
 14  phone          36457 non-null  int64  
 15  email          36457 non-null  int64  
 16  occyp_type     36457 non-null  int64  
 17  family_size    36457 non-null  float64
 18  begin_m

In [19]:
df.nunique()

index            36457
gender               2
car                  2
reality              2
child_num            9
income_total       265
income_type          5
edu_type             5
family_type          5
house_type           6
DAYS_BIRTH        7183
DAYS_EMPLOYED     3640
FLAG_MOBIL           1
work_phone           2
phone                2
email                2
occyp_type          19
family_size         10
begin_month         61
dtype: int64

# AutoEncoder

In [20]:
feature_name = 'ae'
algo_name = 'lgb'
model_name = f'{algo_name}_{feature_name}'

In [23]:
#AutoEncoder 준비
encoding_dim = 64

def get_model(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)

        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim

    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)

    encoded = keras.layers.Dense(encoding_dim * 3, activation='relu')(merged_inputs)
    encoded = keras.layers.Dropout(dropout)(encoded)
    encoded = keras.layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    encoded = keras.layers.Dropout(dropout)(encoded)    
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    
    decoded = keras.layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim * 3, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)    
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(encoded)

    encoder = keras.Model([num_input] + cat_inputs, encoded)
    ae = keras.Model([num_input] + cat_inputs, decoded)
    ae.add_loss(keras.losses.mean_squared_error(merged_inputs, decoded))
    ae.compile(optimizer='adam')
    return ae, encoder

In [22]:
ae, encoder = get_model(encoding_dim)
ae.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender_input (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
car_input (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
reality_input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
income_type_input (InputLayer)  [(None, 1)]          0                                            
____________________________________________________________________________________________

In [24]:
inputs = [df[num_cols].values] + [df[x].values for x in cat_cols]
ae.fit(inputs, inputs,
      epochs=100,
      batch_size=16384,
      shuffle=True,
      validation_split=.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fb4191c8a30>

In [26]:
feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'


In [27]:
encoding = encoder.predict(inputs)
print(encoding.shape)
np.savetxt(feature_file, encoding, fmt='%.6f', delimiter=',')

(36457, 64)


# Model training

In [30]:
y = train[target_col]
n_trn = train.shape[0]

df_enc = pd.concat((df[feature_cols], pd.DataFrame(encoding, columns=[f'enc_{x}' for x in range(encoding_dim)])), axis=1)

X = df_enc.iloc[:n_trn]
X_tst = df_enc.iloc[n_trn:]
print(y.shape, X.shape, X_tst.shape)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects