In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from catboost.datasets import amazon

train, test = amazon()

In [3]:
print("Train shape : {}, Test shape : {}".format(train.shape, test.shape))

Train shape : (32769, 10), Test shape : (58921, 10)


In [9]:
train.head(10)

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325
5,0,45333,14561,117951,117952,118008,118568,118568,19721,118570
6,1,25993,17227,117961,118343,123476,118980,301534,118295,118982
7,1,19666,4209,117961,117969,118910,126820,269034,118638,126822
8,1,31246,783,117961,118413,120584,128230,302830,4673,128231
9,1,78766,56683,118079,118080,117878,117879,304519,19721,117880


In [5]:
test.head(5)

Unnamed: 0,id,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,78766,72734,118079,118080,117878,117879,118177,19721,117880
1,2,40644,4378,117961,118327,118507,118863,122008,118398,118865
2,3,75443,2395,117961,118300,119488,118172,301534,249618,118175
3,4,43219,19986,117961,118225,118403,120773,136187,118960,120774
4,5,42093,50015,117961,118343,119598,118422,300136,118424,118425


In [6]:
train.apply(lambda x: len(x.unique()))

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

In [11]:
# 분석 
# train set에서 ACTION은 0 또는 1로 이루어짐
# ROLE_TITLE과 ROLE_CODE는 특성이 동일한지 확인 필요
# 특성을 확인하고, training을 위한 columns name set을 만들어야함 -> col4train

In [12]:
col1 = "ROLE_TITLE"
col2 = "ROLE_CODE"

In [13]:
pair = len(train.groupby([col1, col2]).size())
single = len(train.groupby(col1).size())

In [15]:
[col1, col2, pair, single]

['ROLE_TITLE', 'ROLE_CODE', 343, 343]

In [16]:
# col1과 col2를 동시에 groupby한 결과와 col1만으로 집계하였을때 결과가 동일함 -> 동일한 특성을 갖음

In [18]:
col4train = [x for x in train.columns if x != 'ACTION']    # y =train['ACTION']

In [20]:
col4train = [x for x in col4train if x != 'ROLE_TITLE']    # col1, col2 둘 중 하나 제거

In [21]:
col4train

['RESOURCE',
 'MGR_ID',
 'ROLE_ROLLUP_1',
 'ROLE_ROLLUP_2',
 'ROLE_DEPTNAME',
 'ROLE_FAMILY_DESC',
 'ROLE_FAMILY',
 'ROLE_CODE']

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [24]:
ohe = OneHotEncoder(sparse=True, dtype=np.float32, handle_unknown='ignore')

In [25]:
X = ohe.fit_transform(train[col4train])
y = train['ACTION'].values

In [28]:
model = LogisticRegression(penalty='l2'
                           ,C=1.0
                           ,solver='liblinear'
                           ,n_jobs=2
                           ,random_state = 432
                           ,max_iter=1000
                        )

In [29]:
from sklearn.model_selection import cross_validate       # model_selection 모델의 성능평가

In [35]:
import sklearn
sklearn.metrics.SCORERS.keys()           # scoring 종류

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [43]:
stats = cross_validate(model, X, y, cv=5, n_jobs=2, return_train_score=True, scoring='roc_auc', groups=None )
stats = pd.DataFrame(stats)
stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,0.163046,0.023104,0.131065,0.149951,0.171423,0.171462,0.191328
score_time,5.0,0.003228,0.004421,0.0,0.0,0.0,0.008052,0.00809
test_score,5.0,0.863918,0.01,0.850939,0.855367,0.869491,0.871883,0.87191
train_score,5.0,0.974355,0.00079,0.973129,0.97421,0.974344,0.974946,0.975144


In [45]:
# test_score = 0.8639...  ->  더 좋은 점수를 얻어보자!
# LB Score 확인하기

In [50]:
target = "ACTION"

In [51]:
X = ohe.fit_transform(train[col4train])
y = train[target].values
X_te = ohe.transform(test[col4train])

In [55]:
model.fit(X,y)
predictions = model.predict_proba(X_te)
predictions = predictions[:,1]          # 두 클래스(0 or 1) 중 하나의 확률만 뽑아냄

In [58]:
submit = pd.DataFrame()
submit['ID'] = test['id']
submit[target] = predictions

In [60]:
submit.to_csv('submission.csv', index=False)

In [61]:
os.getcwd()

'C:\\Users\\dmfgu\\Downloads'

In [76]:
for x in os.listdir('C:/Users/dmfgu/Downloads'):             # os 이용하여 csv로 끝나는 파일 이름 검색
    if x.endswith('csv'):
        print(x)

diabetes.csv
submission.csv
