In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
from catboost.datasets import amazon

In [4]:
train, test = amazon()

In [5]:
train.isnull().sum()

ACTION              0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [6]:
test.isnull().sum()

id                  0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [7]:
train.shape

(32769, 10)

In [8]:
test.shape

(58921, 10)

In [9]:
train.apply(lambda x : len(x.unique()))  # apply calls the lambda once per column, so this counts the distinct values in each column

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

In [10]:
# Feature columns used for training: every column except the target (ACTION)
# and ROLE_TITLE.
# NOTE(review): ROLE_TITLE is presumably dropped as redundant with ROLE_CODE
# (both show 343 distinct values in train) — confirm.
col4train = [x for x in train.columns if x not in ['ACTION', 'ROLE_TITLE']]

In [11]:
col4train

['RESOURCE',
 'MGR_ID',
 'ROLE_ROLLUP_1',
 'ROLE_ROLLUP_2',
 'ROLE_DEPTNAME',
 'ROLE_FAMILY_DESC',
 'ROLE_FAMILY',
 'ROLE_CODE']

In [12]:
y = train['ACTION'].values

# Unsupervised categorical encodings
#### train, test set을 모두 사용

In [13]:
from sklearn.ensemble import ExtraTreesClassifier     # 분류 모델에서 가장 많이 쓰임(랜덤성이 강하여 가장 일반화가 잘되는 모델)
from sklearn.model_selection import StratifiedKFold   # K-Fold와 차이점 중요!!
# KFold splits the rows by position only; StratifiedKFold splits so that every fold keeps the same class (label) proportions as the full dataset
from sklearn.model_selection import cross_validate    # 모델의 성능 검증 (overfitting 확인)

In [14]:
# Factory for the classifier that every encoding experiment is scored with.
def get_model():
    """Return a new, unfitted ExtraTreesClassifier with fixed settings.

    The settings (300 trees, 3 parallel jobs, random_state 5436) are the same
    on every call, so the scores of different encodings are comparable.
    """
    return ExtraTreesClassifier(
        n_estimators=300,
        n_jobs=3,
        random_state=5436,
    )

# Cross-validate a model on a dataset and summarise the per-fold statistics.
def validate_model(model, data):
    """Run 5-fold stratified cross-validation and describe the results.

    Parameters
    ----------
    model : estimator passed straight to ``cross_validate``.
    data : indexable whose item 0 is the feature matrix and item 1 the target;
        the notebook always calls this with ``[features, y]``.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` of the cross-validation output, transposed so that each
        statistic returned by ``cross_validate`` (fit/score times, test and
        train ROC AUC) is a row and count/mean/std/min/quartiles/max are
        the columns.
    """
    features, target = data[0], data[1]
    # Fixed seed so that every encoding is evaluated on the same folds.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4141)
    cv_output = cross_validate(
        model, features, target,
        cv=folds, n_jobs=2, scoring='roc_auc', return_train_score=True,
    )
    return pd.DataFrame(cv_output).describe().transpose()

# Apply an encoding function to train and test jointly, then split them again.
def transform_dataset(train, test, func, func_params=None):
    """Encode ``train`` and ``test`` together with ``func`` and split the result.

    The two frames are stacked (train rows first), passed through ``func`` in a
    single call so that the encoding is fitted on both, and the output is cut
    back at the original train/test boundary.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the same columns.
    func : callable
        Called as ``func(stacked_frame, **func_params)``. It must return one
        row per input row, in the same order.
    func_params : dict, optional
        Extra keyword arguments for ``func``. Defaults to no extra arguments.

    Returns
    -------
    (new_train, new_test)
        If ``func`` returned a DataFrame, both parts are DataFrames with a
        fresh 0-based index; otherwise (for example an array or sparse matrix)
        they are plain row slices of whatever ``func`` returned.
    """
    # A None sentinel instead of a `{}` default: a dict default is created once
    # and shared between all calls.
    if func_params is None:
        func_params = {}

    n_train = train.shape[0]
    dataset = pd.concat([train, test], ignore_index=True)
    dataset = func(dataset, **func_params)

    if isinstance(dataset, pd.DataFrame):
        new_train = dataset.iloc[:n_train, :].reset_index(drop=True)
        new_test = dataset.iloc[n_train:, :].reset_index(drop=True)
    else:
        # Non-DataFrame results only need to support row slicing.
        new_train = dataset[:n_train]
        new_test = dataset[n_train:]
    return new_train, new_test

### Label Encoding (각 범주에 무작위 정수를 할당 — 알파벳 순이 아님)

In [15]:
MJTCP = 32292  # default seed for the random label assignment below


# For each column of the dataset, create N columns of random integer labels.
def assign_rnd_integer(dataset, number_of_times = 5, seed = MJTCP):
    """Label-encode every column several times with random integer codes.

    For each input column ``c`` and each ``i`` in ``range(number_of_times)``,
    an output column ``"<c>_<i>"`` is produced in which every distinct value of
    ``c`` is replaced by an integer from ``0 .. n_unique - 1``. The assignment
    is a random permutation, drawn independently for every output column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Categorical columns to encode.
    number_of_times : int
        How many independent random encodings to create per column.
    seed : int
        Seed passed to ``np.random.seed``. Note that this resets NumPy's
        global random state.

    Returns
    -------
    pandas.DataFrame
        ``len(dataset.columns) * number_of_times`` columns with one row per
        input row, in input order.
    """
    np.random.seed(seed)
    encoded = {}
    for c in dataset.columns:
        # The distinct values do not depend on the repetition, so compute them
        # once per column instead of once per generated column.
        unique_vals = dataset[c].unique()
        mapping = pd.DataFrame({c: unique_vals})
        col_names = []
        for i in range(number_of_times):
            col_name = f"{c}_{i}"
            labels = np.arange(len(unique_vals))
            # Same shuffle calls, in the same order, as one draw per output
            # column, so a given seed always yields the same labels.
            np.random.shuffle(labels)
            mapping[col_name] = labels
            col_names.append(col_name)
        if not col_names:
            continue
        # A single left join per source column maps all of its encodings at
        # once; a left join keeps the rows in the order of `dataset`.
        merged = dataset[[c]].merge(mapping, on = c, how = 'left')
        for col_name in col_names:
            encoded[col_name] = merged[col_name].values
    # Build the frame in one step rather than inserting columns one at a time.
    return pd.DataFrame(encoded)

In [16]:
# Encode train and test in one pass so both parts share the same value-to-integer mapping.
new_train, new_test = transform_dataset(
            train[col4train], test[col4train],
            assign_rnd_integer, {"number_of_times" : 5}
)
print(new_train.shape, new_test.shape)            # 8 input columns x 5 random encodings each = 40 output columns

(32769, 40) (58921, 40)


In [17]:
new_train.head(5)

Unnamed: 0,RESOURCE_0,RESOURCE_1,RESOURCE_2,RESOURCE_3,RESOURCE_4,MGR_ID_0,MGR_ID_1,MGR_ID_2,MGR_ID_3,MGR_ID_4,...,ROLE_FAMILY_0,ROLE_FAMILY_1,ROLE_FAMILY_2,ROLE_FAMILY_3,ROLE_FAMILY_4,ROLE_CODE_0,ROLE_CODE_1,ROLE_CODE_2,ROLE_CODE_3,ROLE_CODE_4
0,4389,3561,5237,5701,3584,1064,3615,2251,911,1006,...,60,62,32,59,16,10,51,262,36,188
1,4111,6450,7308,5447,6616,3007,177,4687,1125,336,...,9,27,0,3,64,352,106,130,327,76
2,4009,2880,2476,1732,6819,4465,2100,4860,3517,4115,...,34,51,50,14,29,277,258,232,326,45
3,3402,4180,1831,1607,2508,798,2017,3961,698,4490,...,60,62,32,59,16,106,187,306,212,17
4,4312,5342,5151,2891,174,4429,1504,791,2071,622,...,64,5,15,50,25,346,2,134,103,50


In [18]:
new_test.head(5)

Unnamed: 0,RESOURCE_0,RESOURCE_1,RESOURCE_2,RESOURCE_3,RESOURCE_4,MGR_ID_0,MGR_ID_1,MGR_ID_2,MGR_ID_3,MGR_ID_4,...,ROLE_FAMILY_0,ROLE_FAMILY_1,ROLE_FAMILY_2,ROLE_FAMILY_3,ROLE_FAMILY_4,ROLE_CODE_0,ROLE_CODE_1,ROLE_CODE_2,ROLE_CODE_3,ROLE_CODE_4
0,5948,1689,5824,4743,1211,2456,2692,3358,3579,697,...,34,51,50,14,29,277,258,232,326,45
1,6736,2854,4636,4331,959,1655,3320,736,703,1473,...,32,13,39,16,48,218,272,74,308,158
2,5827,2469,1284,6159,1205,1836,4707,4355,3318,3301,...,21,28,7,54,6,273,66,26,22,128
3,3919,6009,615,685,5014,1866,3312,2262,228,2712,...,50,2,21,15,19,318,31,325,161,304
4,4471,4811,1058,6106,647,2132,3833,653,2697,2192,...,4,64,38,36,31,81,344,326,165,349


In [19]:
new_train.values

array([[4389, 3561, 5237, ...,  262,   36,  188],
       [4111, 6450, 7308, ...,  130,  327,   76],
       [4009, 2880, 2476, ...,  232,  326,   45],
       ...,
       [3599, 6819, 7436, ...,  151,  261,   37],
       [3795, 3724, 6035, ...,  157,  343,   89],
       [4220, 2658, 5414, ...,   88,  279,  230]])

In [20]:
validate_model(
    model = get_model(),
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,15.058661,5.940882,7.053441,13.116007,13.195264,20.748003,21.180588
score_time,5.0,0.611386,0.190607,0.346356,0.517147,0.61797,0.743207,0.832251
test_score,5.0,0.857485,0.008047,0.847689,0.853171,0.855,0.86524,0.866324
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


### One-Hot encoding      
#### Label encoding 보다 성능은 좋지만 running time, N(dimension)이 너무 커지는 문제점

In [24]:
from sklearn.preprocessing import OneHotEncoder
# Transform the given dataset to its one-hot-encoded representation.
def one_hot(dataset):
    """One-hot encode every column of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Categorical columns; each distinct value of each column becomes one
        indicator column.

    Returns
    -------
    scipy sparse matrix
        float32 indicator matrix with one row per input row.
    """
    # Sparse output is the encoder's default, so no sparse flag is passed.
    # The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2
    # and removed in 1.4, so spelling out either name ties the notebook to one
    # side of that change.
    ohe = OneHotEncoder(dtype=np.float32, handle_unknown='ignore')
    return ohe.fit_transform(dataset.values)

In [25]:
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    one_hot)
print(new_train.shape, new_test.shape)

(32769, 16600) (58921, 16600)


In [None]:
validate_model(
        model = get_model(),
        data = [new_train, y]
)                                    # the results are good, but running time is bigger too

### SVD encoding
#### 고차원 벡터의 가장 중요한 정보를 유지하면서 저차원, 조밀한 벡터로 압축하고자 하는 방식

In [29]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [30]:
def extract_col_interaction(dataset, col1, col2, tfidf = True):
    """Summarise how ``col1`` co-occurs with ``col2`` as one number per ``col1`` value.

    Each distinct ``col1`` value is turned into a "document" made of the
    ``col2`` values that appear in its rows. The documents are vectorised
    (TF-IDF when ``tfidf`` is truthy, raw counts otherwise) and reduced to a
    single component with truncated SVD.

    Parameters
    ----------
    dataset : pandas.DataFrame
    col1 : str
        Column whose values are being described (the grouping key).
    col2 : str
        Column whose values form the documents.
    tfidf : bool
        Choose ``TfidfVectorizer`` (True) or ``CountVectorizer`` (False).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``col1`` (one row per distinct value) and
        ``"<col1>_<col2>_svd"`` holding the SVD component.
    """
    # One space-separated string of col2 values per col1 value.
    documents = dataset.groupby([col1])[col2].agg(
        lambda values: " ".join(str(v) for v in values)
    )

    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    # Split on single spaces only, mirroring how the documents were joined.
    vectorizer = vectorizer_cls(tokenizer=lambda text: text.split(" "))
    term_matrix = vectorizer.fit_transform(documents)

    svd = TruncatedSVD(n_components=1, random_state = 5115)
    component = svd.fit_transform(term_matrix)

    return pd.DataFrame({
        col1: documents.index.values,
        col1+"_{}_svd".format(col2): component.ravel(),
    })

In [31]:
import itertools
def get_col_interactions_svd(dataset, tfidf = True):
    """Build one SVD interaction feature for every ordered pair of columns.

    For each ordered pair ``(col1, col2)`` of distinct columns,
    ``extract_col_interaction`` yields one value per distinct ``col1`` value;
    that value is joined back onto the rows through ``col1``.

    Parameters
    ----------
    dataset : pandas.DataFrame
    tfidf : bool
        Forwarded to ``extract_col_interaction``.

    Returns
    -------
    pandas.DataFrame
        One column per ordered column pair (``n * (n - 1)`` columns for ``n``
        input columns), one row per input row, in input order.
    """
    encoded = {}
    for col1,col2 in itertools.permutations(dataset.columns, 2):
        data = extract_col_interaction(dataset, col1, col2, tfidf)
        # `data` holds the key column plus the feature column. Select the
        # feature by excluding the key: matching on the substring "svd" would
        # pick the key column itself whenever its name contains "svd".
        col_name = [x for x in data.columns if x != col1][0]
        # A left join keeps the rows in the order of `dataset`.
        encoded[col_name] = dataset[[col1]].merge(data, on = col1, how = 'left')[col_name]
    # Build the frame in one step rather than inserting columns one at a time.
    return pd.DataFrame(encoded)

In [32]:
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    get_col_interactions_svd
)

In [33]:
print(new_train.shape, new_test.shape)

(32769, 56) (58921, 56)


In [34]:
new_train.head(5)

Unnamed: 0,RESOURCE_MGR_ID_svd,RESOURCE_ROLE_ROLLUP_1_svd,RESOURCE_ROLE_ROLLUP_2_svd,RESOURCE_ROLE_DEPTNAME_svd,RESOURCE_ROLE_FAMILY_DESC_svd,RESOURCE_ROLE_FAMILY_svd,RESOURCE_ROLE_CODE_svd,MGR_ID_RESOURCE_svd,MGR_ID_ROLE_ROLLUP_1_svd,MGR_ID_ROLE_ROLLUP_2_svd,...,ROLE_FAMILY_ROLE_DEPTNAME_svd,ROLE_FAMILY_ROLE_FAMILY_DESC_svd,ROLE_FAMILY_ROLE_CODE_svd,ROLE_CODE_RESOURCE_svd,ROLE_CODE_MGR_ID_svd,ROLE_CODE_ROLE_ROLLUP_1_svd,ROLE_CODE_ROLE_ROLLUP_2_svd,ROLE_CODE_ROLE_DEPTNAME_svd,ROLE_CODE_ROLE_FAMILY_DESC_svd,ROLE_CODE_ROLE_FAMILY_svd
0,0.015059,0.999236,0.869578,0.008674,0.695,0.846882,0.713359,0.034007,0.999988,0.964151,...,0.625506,0.790561,-0.069049,0.518124,5.6e-05,0.988656,0.933793,0.082077,0.940899,-0.003581
1,0.034197,0.982219,0.95253,0.082501,0.180704,0.223276,0.19672,0.174024,0.999988,0.265149,...,0.635695,0.001405,0.021511,0.483528,0.000877,0.996758,0.726202,0.03802,-0.000341,-0.000446
2,0.000674,0.001712,0.006027,0.26188,0.001021,0.010063,0.004702,0.006119,5.562812e-09,-8e-06,...,0.262357,0.016452,0.033778,0.09315,-0.000294,0.006333,0.010736,0.529856,-0.001061,0.000406
3,0.028655,0.999236,0.934787,0.012435,0.083169,0.994862,0.663811,0.089637,0.999988,0.265149,...,0.625506,0.790561,-0.069049,0.578121,2.1e-05,0.996215,0.955166,0.079606,0.952335,-0.003581
4,0.000827,0.482659,0.111446,0.114995,0.01028,0.064558,0.052313,0.009358,4.516656e-05,1.5e-05,...,0.0626,0.011422,-0.070611,0.053252,-0.000292,0.006631,0.010132,0.506337,0.00028,-0.000696


### Frequency encoding

In [35]:
def get_freq_encoding(dataset):
    """Replace every value by the number of rows that share it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        One ``"<column>_freq"`` column per input column, with one row per input
        row in input order; each cell is how often that row's value occurs in
        the column.
    """
    encoded = {}
    for c in dataset.columns:
        freq_col = c+"_freq"
        # Row count per distinct value, as a two-column lookup table.
        counts = dataset.groupby([c]).size().reset_index(name=freq_col)
        # A left join keeps the rows in the order of `dataset`.
        encoded[freq_col] = dataset[[c]].merge(counts, on = c, how = "left")[freq_col]
    return pd.DataFrame(encoded)

In [46]:
train.groupby(['ROLE_CODE']).size().reset_index()

Unnamed: 0,ROLE_CODE,0
0,117880,1256
1,117888,806
2,117898,165
3,117900,240
4,117908,3583
...,...,...
338,254396,3
339,258436,7
340,266863,1
341,268610,1


In [49]:
# Scratch check of how merge behaves. ROLE_CODE repeats on both sides, so this
# left join is many-to-many: every row is paired with every row that shares its
# ROLE_CODE, which is why the result has about 45.7 million rows rather than
# the 32,769 rows of train.
train[['ROLE_CODE']].merge(train[col4train], on='ROLE_CODE', how = "left")

Unnamed: 0,ROLE_CODE,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY
0,117908,39353,85475,117961,118300,123472,117906,290919
1,117908,15672,111936,117961,118300,118783,240983,290919
2,117908,77385,14829,117961,118052,119986,117906,290919
3,117908,34687,815,117961,118300,123719,117906,290919
4,117908,73753,70062,117961,118386,118746,117906,290919
...,...,...,...,...,...,...,...,...
45691012,118570,37651,13262,118219,118220,118221,268194,19721
45691013,118570,25993,22356,117926,118124,117920,122142,19721
45691014,118570,41470,14289,118602,118603,117941,118568,19721
45691015,118570,20293,27915,117926,117927,117920,281735,19721


In [50]:
train[col4train]

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,39353,85475,117961,118300,123472,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,308574,118539
2,36724,14457,118219,118220,117884,267952,19721,117880
3,36135,5396,117961,118343,119993,240983,290919,118322
4,42680,5905,117929,117930,119569,123932,19793,119325
...,...,...,...,...,...,...,...,...
32764,23497,16971,117961,118300,119993,240983,290919,118322
32765,25139,311198,91261,118026,122392,173805,249618,121145
32766,34924,28805,117961,118327,120299,152038,118612,124924
32767,80574,55643,118256,118257,117945,280788,292795,119082


In [36]:
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    get_freq_encoding
)

In [37]:
print(new_train.shape, new_test.shape)
new_train.head(5)

(32769, 8) (58921, 8)


Unnamed: 0,RESOURCE_freq,MGR_ID_freq,ROLE_ROLLUP_1_freq,ROLE_ROLLUP_2_freq,ROLE_DEPTNAME_freq,ROLE_FAMILY_DESC_freq,ROLE_FAMILY_freq,ROLE_CODE_freq
0,7,145,59065,12155,180,17996,28861,9569
1,93,34,59065,10920,406,29,3506,213
2,8,7,518,518,1645,92,7768,3838
3,2,153,59065,10920,494,3244,28861,12082
4,28,18,815,396,143,41,945,187


In [38]:
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,5.353141,1.283965,3.101795,5.64047,5.685828,6.138362,6.199249
score_time,5.0,0.453246,0.109453,0.272243,0.455692,0.457902,0.538057,0.542338
test_score,5.0,0.818272,0.011721,0.800603,0.817661,0.819162,0.82045,0.833485
train_score,5.0,0.999883,1.4e-05,0.999868,0.999868,0.999888,0.999894,0.999898


In [51]:
new_train1, new_test1 = transform_dataset(
    train[col4train], test[col4train], get_freq_encoding
)
new_train2, new_test2 = transform_dataset(
    train[col4train], test[col4train], get_col_interactions_svd
)
new_train3, new_test3 = transform_dataset(
    train[col4train], test[col4train], 
    assign_rnd_integer, {"number_of_times":10}
)

In [52]:
new_train = pd.concat([new_train1, new_train2, new_train3], axis = 1)
new_test = pd.concat([new_test1, new_test2, new_test3], axis = 1)
print(new_train.shape, new_test.shape)

(32769, 144) (58921, 144)


In [53]:
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,35.302648,14.394599,13.894794,32.640936,32.67119,48.511643,48.794678
score_time,5.0,0.688592,0.256107,0.303241,0.629804,0.664667,0.910429,0.934818
test_score,5.0,0.873939,0.009381,0.860284,0.86953,0.877402,0.877532,0.884949
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
