### Amazon.com - Employee Access Challenge

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from catboost.datasets import amazon

# amazon() yields a (train, test) pair; both are used as pandas DataFrames below.
train, test = amazon()

In [3]:
# Report the (rows, columns) of both splits on one line.
shape_report = "Train shape : {}, Test shape : {}"
print(shape_report.format(train.shape, test.shape))

Train shape : (32769, 10), Test shape : (58921, 10)


In [4]:
# Preview the first rows of the training split (head() defaults to five rows).
train.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [5]:
# Preview the first rows of the test split (head() defaults to five rows).
test.head()

Unnamed: 0,id,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,78766,72734,118079,118080,117878,117879,118177,19721,117880
1,2,40644,4378,117961,118327,118507,118863,122008,118398,118865
2,3,75443,2395,117961,118300,119488,118172,301534,249618,118175
3,4,43219,19986,117961,118225,118403,120773,136187,118960,120774
4,5,42093,50015,117961,118343,119598,118422,300136,118424,118425


In [6]:
# Number of distinct values per column; dropna=False counts a missing value
# as its own category, matching len(Series.unique()).
train.nunique(dropna=False)

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

In [7]:
import itertools

In [8]:
# Name of the label column; it is excluded from the feature list built below.
target = "ACTION"

In [9]:
# Candidate feature columns: every column of train except the label.
col4train = [name for name in train.columns if name != target]

# The two columns whose relationship is inspected in the following cells.
col1, col2 = "ROLE_TITLE", "ROLE_CODE"

In [10]:
# Group sizes for the (col1, col2) combination and for col1 alone.
pair_sizes = train.groupby([col1, col2]).size()
single_sizes = train.groupby(col1).size()

# If both counts match, each col1 value occurs with exactly one col2 value.
pair, single = len(pair_sizes), len(single_sizes)

In [11]:
# Show the two column names followed by the number of distinct (col1, col2)
# pairs and the number of distinct col1 values.
print(col1, col2, pair, single)

ROLE_TITLE ROLE_CODE 343 343


In [12]:
# Row count for every observed (col1, col2) combination.
train.groupby([col1, col2]).size()

ROLE_TITLE  ROLE_CODE
117879      117880       1256
117885      117888        806
117896      117898        165
117899      117900        240
117905      117908       3583
                         ... 
297560      119817          1
299559      163732         11
307024      118332        467
310825      121395          1
311867      118479          4
Length: 343, dtype: int64

In [13]:
# col4train = "columns for training". ROLE_TITLE is dropped because the counts
# printed above show one distinct ROLE_CODE per ROLE_TITLE in train, so keeping
# both would duplicate the same information.
col4train = [name for name in col4train if name != 'ROLE_TITLE']

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [44]:
# One-hot encoder for the categorical id columns.
# - The `sparse` keyword is intentionally omitted: it was deprecated in
#   scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`). Sparse output
#   is the default under either name, so leaving it out works on all versions.
# - handle_unknown='ignore' encodes categories not seen during fit as all zeros
#   instead of raising, which is needed when transforming the test split.
ohe = OneHotEncoder(dtype=np.float32, handle_unknown='ignore')

In [45]:
# Display the final feature list. NOTE(review): the surrounding brackets wrap it
# in a second list, so the output is nested; a bare `col4train` would show it flat.
[col4train]

[['RESOURCE',
  'MGR_ID',
  'ROLE_ROLLUP_1',
  'ROLE_ROLLUP_2',
  'ROLE_DEPTNAME',
  'ROLE_FAMILY_DESC',
  'ROLE_FAMILY',
  'ROLE_CODE']]

In [46]:
X = ohe.fit_transform(train[col4train]) # fit the encoder on, and one-hot encode, only the col4train columns of train
y = train['ACTION'].values              # label values of the ACTION column as a plain array

In [47]:
# Quick look at the label array.
y

array([1, 1, 1, ..., 1, 1, 1])

In [48]:
from sklearn.model_selection import cross_validate

In [49]:
# Logistic regression with an L2 penalty, fitted with the liblinear solver.
model = LogisticRegression(
    penalty='l2',
    C=1.0,
    fit_intercept=True,
    random_state=432,
    solver='liblinear',
    max_iter=1000,
)

# 5-fold cross-validation scored by ROC AUC; train scores are kept as well so
# the train/validation gap can be compared in the summary table.
cv_results = cross_validate(
    model,
    X,
    y,
    scoring='roc_auc',
    cv=5,
    n_jobs=2,
    return_train_score=True,
)

# One row per fold -> summary statistics, transposed so each metric is a row.
stats = pd.DataFrame(cv_results)
stats.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,0.363284,0.066085,0.247955,0.379761,0.380265,0.392887,0.41555
score_time,5.0,0.007061,0.000572,0.006198,0.006981,0.007045,0.007318,0.007762
test_score,5.0,0.863918,0.01,0.850939,0.855367,0.869491,0.871884,0.87191
train_score,5.0,0.974356,0.00079,0.973129,0.974217,0.974344,0.974946,0.975144


In [51]:
X = ohe.fit_transform(train[col4train])      # learn each column's categories from train (fit) and one-hot encode train (transform); no scaling/normalisation is involved
y = train["ACTION"].values
# Encode test with the categories learned from train only (transform, not fit_transform).
X_te = ohe.transform(test[col4train])

In [52]:
# Fit on the full encoded training set, then score the encoded test set.
model.fit(X, y)
# Column 1 of predict_proba: the probability of the second class in model.classes_.
predictions = model.predict_proba(X_te)[:, 1]

# Pair every test id with its predicted probability and write the submission
# file without the DataFrame index.
submit = pd.DataFrame({"ID": test["id"], "ACTION": predictions})
submit.to_csv("submission.csv", index=False)