In [None]:
pip install -r ../requirements.txt

In [1]:
import numpy as np
import pandas as pd
import torch

from torch import nn
from pytorch_tabnet.tab_model  import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

class F1_Score(Metric):
    """F1 metric registered under the name "f1", computed on thresholded scores.

    Parameters
    ----------
    threshold : float, default 0.4
        A row is predicted positive when its column-1 score is strictly
        greater than this value. The default reproduces the previously
        hard-coded cut-off, so constructing the metric without arguments
        behaves as before.
    """

    def __init__(self, threshold=0.4):
        self._name = "f1"
        self._maximize = True  # higher F1 is better
        self.threshold = threshold

    def __call__(self, y_true, y_score):
        """Return the F1 score of ``y_true`` against the thresholded scores.

        ``y_score`` is indexed as ``y_score[:, 1]``, i.e. it must be a 2-D
        array whose second column holds the score compared to the threshold.
        """
        positive_score = y_score[:, 1]
        y_pred = (positive_score > self.threshold) * 1
        return f1_score(y_true, y_pred)

In [12]:
def add_code(df_, d_code, h_code, l_code):
    """Return a copy of ``df_`` extended with hierarchy codes from the D/H/L tables.

    For every source column, the cell value is used as a key into the matching
    code table (``table[value][field]``) and the result is stored in a new
    column named ``<source column>_<suffix>``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``person_prefer_d_1..3``, ``contents_attribute_d``,
        ``person_prefer_h_1..3``, ``contents_attribute_h`` and
        ``contents_attribute_l``. It is not modified.
    d_code, h_code, l_code : mapping
        ``{code: {field name: value}}`` lookups for the D, H and L attributes.

    Returns
    -------
    pandas.DataFrame
        The copy with 28 additional columns (16 for D, 8 for H, 4 for L).

    Raises
    ------
    KeyError
        If a cell value or a field name is missing from its code table.
    """
    enriched = df_.copy()

    def expand(source_col, table, levels):
        # One new column per (suffix, field): <source_col>_<suffix> = table[value][field]
        for suffix, field in levels:
            enriched[f'{source_col}_{suffix}'] = enriched[source_col].apply(
                lambda key, field=field: table[key][field]
            )

    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    # Only the middle and large H levels are expanded; the H upper-level code is not used.
    h_levels = (
        ('m', '속성 H 중분류코드'),
        ('l', '속성 H 대분류코드'),
    )
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )

    d_sources = ('person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d')
    h_sources = ('contents_attribute_h', 'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3')

    for source_col in d_sources:
        expand(source_col, d_code, d_levels)
    for source_col in h_sources:
        expand(source_col, h_code, h_levels)
    expand('contents_attribute_l', l_code, l_levels)

    return enriched

In [13]:
def _read_code_table(csv_path):
    """Load a code CSV indexed by its first column as {code: {column: value}}."""
    table = pd.read_csv(csv_path, index_col=0)
    return table.T.to_dict()


# Attribute code lookups consumed by add_code.
d_code = _read_code_table('../data/dcode.csv')
h_code = _read_code_table('../data/hcode.csv')
l_code = _read_code_table('../data/lcode.csv')

# Raw train / test tables.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# Same enrichment applied to both tables.
train_df = add_code(df_train, d_code, h_code, l_code)
test_df = add_code(df_test, d_code, h_code, l_code)

In [14]:
# Tag each frame with its origin so the two can be stacked for joint count
# features and separated again afterwards; test rows get a placeholder target.
train_df['aux'] = 'train'
test_df['aux'] = 'test'
test_df['target'] = -1

# Put both frames in the same (sorted) column order before stacking.
columns = sorted(test_df.columns)
train_df = train_df[columns]
test_df = test_df[columns]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# As with append's defaults, the original row labels are kept.
train_df_aux = pd.concat([train_df, test_df])

# Frequency features computed over train and test rows together.
train_df_aux['count_person'] = train_df_aux.groupby('person_rn')['person_rn'].transform('count')
train_df_aux['count_content'] = train_df_aux.groupby('contents_rn')['contents_rn'].transform('count')
# NOTE(review): the next two count the non-null partner ids per group, so they
# duplicate count_person / count_content unless those id columns contain NaN —
# confirm this is intended.
train_df_aux['count_person_content'] = train_df_aux.groupby('person_rn')['contents_rn'].transform('count')
train_df_aux['count_content_person'] = train_df_aux.groupby('contents_rn')['person_rn'].transform('count')

# Filter-then-drop returns new frames, which avoids the SettingWithCopyWarning
# raised by an in-place drop on a filtered slice.
train_ = train_df_aux[train_df_aux['aux'] == 'train'].drop(columns=['aux'])
test_ = train_df_aux[train_df_aux['aux'] == 'test'].drop(columns=['aux', 'target'])

train = train_.copy()
test = test_.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
# Submission template; its 'target' column is overwritten with the test
# predictions before the file is written out further down.
sample = pd.read_csv('../data/sample_submission.csv')

In [16]:
# Parse the content-open timestamp to datetime on both frames so the date
# parts and the month-based split below can use it.
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'])
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'])

In [17]:
# 'contents_open_dt' was already parsed to datetime in the previous cell, so
# the duplicated conversion that used to sit here is not repeated.

# Row / person / content identifiers and the f / g preference columns are not
# used as model features.
DROP_COLUMNS = ['id', 'person_prefer_f', 'person_prefer_g', 'person_rn', 'contents_rn']
train = train.drop(columns=DROP_COLUMNS)
test = test.drop(columns=DROP_COLUMNS)

# Everything except the '*match*' columns, the target and the timestamp is
# treated as categorical. The count_* features exist at this point and are
# therefore included; the date parts added in the next cell are not.
cat_columns = [
    col for col in train.columns
    if 'match' not in col and col not in ['target', 'contents_open_dt']
]

In [18]:
# Calendar features derived from the open timestamp: day of month ('dia'),
# hour ('hora') and day of week.
for frame in (train, test):
    opened = frame['contents_open_dt'].dt
    frame['dia'] = opened.day
    frame['hora'] = opened.hour
    frame['dayofweek'] = opened.dayofweek

In [19]:
# Keep a handle on the full labelled frame; it is used below for the final
# refit data and for fitting the label encoders.
train_aux = train

# Time-based hold-out on calendar month: months before October train the
# model, October onwards validate it. The month is extracted once with the
# vectorised .dt accessor instead of building a pd.Timestamp per row, twice.
# NOTE(review): the split ignores the year — confirm the data spans one year.
open_month = train_aux['contents_open_dt'].dt.month
train = train_aux[open_month < 10].copy()
val = train_aux[open_month >= 10].copy()

# The raw timestamp is not a model feature (train_aux loses it through the
# column selection below).
train = train.drop(columns=['contents_open_dt'])
val = val.drop(columns=['contents_open_dt'])
test = test.drop(columns=['contents_open_dt'])

# Same sorted feature order everywhere, with the target last where present.
# Multiplying by 1 presumably turns boolean flag columns into integers — TODO confirm.
columns = sorted(test.columns)
train = train[columns+['target']]*1
val = val[columns+['target']]*1
test = test[columns]*1

train_aux = train_aux[columns+['target']]*1

In [20]:
# Sanity check: train + val rows should add up to the full labelled frame,
# and test has one column less (no target).
for frame in (train, val, train_aux, test):
    print(frame.shape)

(411367, 64)
(90584, 64)
(501951, 64)
(46404, 63)


In [21]:
# Positions and cardinalities of the categorical features, passed to
# TabNetClassifier as cat_idxs / cat_dims.
cat_idxs, cat_dims = [], []

for position, column in enumerate(train.columns):
    if column not in cat_columns:
        continue

    # Fit the encoder on the full labelled data (train + val rows).
    encoder = LabelEncoder()
    encoder.fit(train_aux[column].values)
    code_of = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    unseen_code = len(code_of)  # values absent from the labelled data share one extra code

    def encode(value, mapping=code_of, fallback=unseen_code):
        return mapping.get(value, fallback)

    for frame in (train, val, test, train_aux):
        frame[column] = frame[column].apply(encode)

    cat_idxs.append(position)
    cat_dims.append(unseen_code + 1)  # +1 leaves room for the unseen-value code

# Feature matrices / label vectors. 'target' is the last column of the
# labelled frames, so the positions in cat_idxs also index these matrices.
X_train, y_train = train.drop('target', axis=1).values, train['target'].values
X_val, y_val = val.drop('target', axis=1).values, val['target'].values
X_test = test.values
eval_set = (X_val, y_val)

# Full labelled data, used for the final refit.
X_train_full = train_aux.drop('target', axis=1).values
y_train_full = train_aux['target'].values

In [39]:
# TabNet configuration; cat_idxs / cat_dims come from the label-encoding cell.
tabnet_params = dict(
    seed=1990,
    n_steps=4,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=3,
    optimizer_fn=torch.optim.AdamW,  # any torch optimizer class can be passed here
    mask_type='entmax',              # alternative: "sparsemax"
)
clf = TabNetClassifier(**tabnet_params)

Device used : cuda


In [None]:
# Train on the pre-October rows; log-loss and the custom 'f1' metric are
# reported every epoch on both the training and the validation set.
eval_sets = [(X_train, y_train), (X_val, y_val)]
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=eval_sets,
    eval_name=['train', 'val'],
    eval_metric=['logloss', 'f1'],
    max_epochs=200,
    patience=30,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=1,
    drop_last=False,
)

epoch 0  | loss: 0.67443 | train_logloss: 0.64985 | train_f1: 0.67561 | val_logloss: 0.65263 | val_f1: 0.666   |  0:03:07s
epoch 1  | loss: 0.64545 | train_logloss: 0.63668 | train_f1: 0.69443 | val_logloss: 0.64301 | val_f1: 0.68275 |  0:06:17s
epoch 2  | loss: 0.63725 | train_logloss: 0.62853 | train_f1: 0.69888 | val_logloss: 0.63953 | val_f1: 0.68544 |  0:09:25s
epoch 3  | loss: 0.63444 | train_logloss: 0.62784 | train_f1: 0.70202 | val_logloss: 0.63956 | val_f1: 0.68696 |  0:12:33s
epoch 4  | loss: 0.63231 | train_logloss: 0.62732 | train_f1: 0.70187 | val_logloss: 0.6408  | val_f1: 0.68425 |  0:15:41s
epoch 5  | loss: 0.6307  | train_logloss: 0.62235 | train_f1: 0.7006  | val_logloss: 0.64289 | val_f1: 0.68061 |  0:18:50s
epoch 13 | loss: 0.62718 | train_logloss: 0.6188  | train_f1: 0.70742 | val_logloss: 0.63777 | val_f1: 0.68452 |  0:43:56s
epoch 14 | loss: 0.62711 | train_logloss: 0.62012 | train_f1: 0.70424 | val_logloss: 0.64114 | val_f1: 0.68358 |  0:47:01s
epoch 20 | loss:

In [36]:
# Class probabilities on the validation rows; column 1 is thresholded in the
# sweep below.
preds = clf.predict_proba(X_val)

In [25]:
# Validation F1 for a range of decision thresholds on the column-1 probability.
for threshold in (0.2, 0.25, 0.3, 0.35, 0.4, 0.5):
    print(f1_score(y_val, (preds[:, 1] > threshold) * 1))

0.6745015325786775
0.6790874641855881
0.6831755509290175
0.6875242970066087
0.689239889900487
0.6496530857113048


In [37]:
# Same threshold sweep, re-run against the current `preds`.
for threshold in (0.2, 0.25, 0.3, 0.35, 0.4, 0.5):
    print(f1_score(y_val, (preds[:, 1] > threshold) * 1))

0.6789866885048323
0.6823740497662779
0.6845401993462473
0.6860309573858206
0.6840804677711264
0.6535475865746486


In [None]:
# Refit on all labelled rows (train + validation months) for the submission.
# No eval_set is passed here, so there is no validation metric to monitor.
full_fit_options = dict(
    max_epochs=20,
    batch_size=256,
    patience=10,
    virtual_batch_size=128,
    num_workers=1,
    drop_last=False,
)
clf.fit(X_train=X_train_full, y_train=y_train_full, **full_fit_options)

In [26]:
# Score the test rows and turn the column-1 probability into a 0/1 label
# with the 0.4 cut-off.
test_proba = clf.predict_proba(X_test)
preds = (test_proba[:, 1] > 0.4) * 1

In [27]:
# Fill the submission template with the test labels and write it to disk.
# Assignment is positional, so `preds` must follow the template's row order.
sample['target'] = preds
sample.to_csv('sub_tabnet_26.csv',index=False)

In [28]:
# Submissions to ensemble. sub_tabnet_26.csv is written above; the _23 and _25
# files are not produced anywhere in this notebook and must already exist on disk.
sub1 = pd.read_csv('sub_tabnet_26.csv')
sub2 = pd.read_csv('sub_tabnet_23.csv')
sub3 = pd.read_csv('sub_tabnet_25.csv')

In [29]:
# Majority vote: the mean of the three labels (presumably all 0/1) rounded to
# the nearest integer. The sum aligns rows by index label, so it assumes the
# three files share the same row order.
votes = sub1['target'] + sub2['target'] + sub3['target']
sub1['target_final'] = (votes / 3).round(0).astype(int)

In [30]:
# Inspect the single-model label next to the ensembled one.
sub1

Unnamed: 0,id,target,target_final
0,0,0,0
1,1,0,0
2,2,0,1
3,3,0,1
4,4,1,1
...,...,...,...
46399,46399,1,1
46400,46400,1,1
46401,46401,1,1
46402,46402,1,1


In [31]:
# Discard the single-model label; only the ensembled vote is submitted.
sub1.drop(columns=['target'], inplace=True)

In [32]:
# Check that only 'id' and the ensembled label remain.
sub1

Unnamed: 0,id,target_final
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [33]:
# Rename positionally so 'target_final' becomes 'target'.
# This relies on the frame having exactly the columns ['id', 'target_final'].
sub1.columns = ['id','target']

In [34]:
# Write the ensembled submission without the row index.
sub1.to_csv('sub_tabnet_27.csv',index=False)