In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from catboost.datasets import amazon

In [3]:
# Load the Amazon dataset shipped with catboost; the call yields a (train, test) pair.
train, test = amazon()

In [4]:
# Count missing values per column of the training frame.
train.isnull().sum()

ACTION              0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [5]:
# Count missing values per column of the test frame.
test.isnull().sum()

id                  0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [6]:
# (rows, columns) of the training frame.
train.shape

(32769, 10)

In [7]:
# (rows, columns) of the test frame.
test.shape

(58921, 10)

In [16]:
# Cardinality of every column. DataFrame.nunique is the built-in equivalent of
# applying len(col.unique()) column by column; dropna=False keeps NaN counted as
# its own value, matching what len(col.unique()) reports.
train.nunique(dropna=False)

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

In [25]:
# Feature columns: every column except the target (ACTION) and ROLE_TITLE.
# NOTE(review): presumably ROLE_TITLE is dropped as redundant with ROLE_CODE
# (both report 343 unique values in the cardinality output) — confirm.
col4train = [x for x in train.columns if x not in ['ACTION', 'ROLE_TITLE']]

In [26]:
# Show the selected feature columns.
col4train

['RESOURCE',
 'MGR_ID',
 'ROLE_ROLLUP_1',
 'ROLE_ROLLUP_2',
 'ROLE_DEPTNAME',
 'ROLE_FAMILY_DESC',
 'ROLE_FAMILY',
 'ROLE_CODE']

In [28]:
# Target vector: the underlying values of the ACTION column.
y = train['ACTION'].values

# Unsupervised categorical encodings

In [33]:
from sklearn.ensemble import ExtraTreesClassifier     # author's note: a commonly used classifier; its heavy randomization is believed to help generalization
from sklearn.model_selection import StratifiedKFold   # the difference from plain K-Fold matters!
# K-Fold simply cuts the data by index; StratifiedKFold cuts it so that train and validation keep the same class ratio
from sklearn.model_selection import cross_validate    # validates model performance (checks for overfitting)

In [34]:
def get_model():
    """Create a fresh, unfitted ExtraTreesClassifier with the notebook's fixed settings.

    Returns:
        ExtraTreesClassifier built with n_estimators=300, n_jobs=3 and
        random_state=5436.
    """
    return ExtraTreesClassifier(n_estimators=300, n_jobs=3, random_state=5436)

def Validate_model(model, data):
    """Cross-validate ``model`` on ``data`` and summarize the per-fold results.

    Args:
        model: Estimator handed to ``cross_validate``.
        data: Indexable pair; ``data[0]`` is passed as the feature matrix and
            ``data[1]`` as the target, so callers bundle ``(X, y)`` together.

    Returns:
        pandas.DataFrame: ``describe()`` of the ``cross_validate`` output,
        transposed so that each row is one reported quantity and the columns
        are the summary statistics.
    """
    # The keyword is ``n_splits`` (plural); a misspelled keyword makes
    # StratifiedKFold raise TypeError before any fold is evaluated.
    skf = StratifiedKFold(n_splits=5, random_state=4141, shuffle=True)
    stats = cross_validate(model, data[0], data[1],
                           cv=skf, n_jobs=2, scoring='roc_auc', return_train_score=True)
    stats = pd.DataFrame(stats)
    return stats.describe().transpose()

def transform_dataset(train, test, func, func_params = {}):
    """Apply ``func`` to train and test stacked together, then split the result back.

    Args:
        train: DataFrame placed first in the stacked frame.
        test: DataFrame placed after ``train`` in the stacked frame.
        func: Callable invoked as ``func(stacked, **func_params)``.
        func_params: Keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(new_train, new_test)``. The first ``train.shape[0]`` rows of
        the transformed result go to ``new_train`` and the remainder to
        ``new_test``. DataFrame results are split with ``iloc`` and re-indexed
        from 0; any other result is split with plain slicing.
    """
    n_train_rows = train.shape[0]
    stacked = pd.concat([train, test], ignore_index=True)
    transformed = func(stacked, **func_params)

    # Non-DataFrame results (e.g. arrays) only support positional slicing.
    if not isinstance(transformed, pd.DataFrame):
        return transformed[:n_train_rows], transformed[n_train_rows:]

    train_part = transformed.iloc[:n_train_rows, :].reset_index(drop=True)
    test_part = transformed.iloc[n_train_rows:, :].reset_index(drop=True)
    return train_part, test_part

### Label Encoding

In [35]:
MJTCP = 32292
def assign_rnd_integer(dataset, number_of_times = 5, seed = MJTCP):
    """Encode every column as randomly permuted integer labels, several times over.

    For each column ``c`` of ``dataset`` and each ``i`` in
    ``range(number_of_times)``, an output column named ``"<c>_<i>"`` is created
    in which every distinct value of ``c`` is replaced by an integer from
    ``0 .. n_unique - 1``, assigned through a fresh random shuffle.

    Args:
        dataset: DataFrame whose columns are encoded.
        number_of_times: Number of independent random labelings per column.
        seed: Value passed to ``np.random.seed`` before any shuffling.

    Returns:
        pandas.DataFrame: one column per (source column, repetition) pair, with
        as many rows as ``dataset``.

    Note:
        This seeds NumPy's global random generator as a side effect.
    """
    np.random.seed(seed)
    encoded = {}
    for c in dataset.columns:
        # The distinct values do not change between repetitions, so compute once.
        unique_vals = dataset[c].unique()
        source = dataset[[c]]
        for i in range(number_of_times):
            # str(c) keeps non-string column labels (e.g. integers) working.
            col_name = str(c)+"_"+str(i)
            labels = np.arange(len(unique_vals))
            np.random.shuffle(labels)
            mapping = pd.DataFrame({c: unique_vals, col_name: labels})
            # A left merge keeps the row order of ``dataset``.
            encoded[col_name] = source.merge(mapping, on = c, how = 'left')[col_name].values
    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(encoded)