In [None]:
pip install -r ../requirements.txt

In [1]:
import numpy as np
import pandas as pd
import torch

from torch import nn
from pytorch_tabnet.tab_model  import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

class F1_Score(Metric):
    """Custom evaluation metric: F1 on thresholded positive-class scores.

    The metric is registered under the name "f1" and is flagged as one to
    maximize.
    """

    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        """Return the F1 score of ``y_true`` against hard labels from ``y_score``.

        Column 1 of ``y_score`` is compared against a fixed 0.4 cut-off and the
        resulting booleans are turned into 0/1 labels before scoring.
        """
        positive_score = y_score[:, 1]
        hard_labels = (positive_score > 0.4) * 1
        return f1_score(y_true, hard_labels)

In [2]:
def add_code(df_, d_code, h_code, l_code):
    """Return a copy of ``df_`` enriched with the hierarchy codes of its D, H and L attributes.

    Each ``*_code`` argument is a nested mapping ``{code: {field name: value}}``.
    For every source column the raw code is looked up in the matching mapping
    and one new column per hierarchy level is appended, named
    ``<source column>_<suffix>``. The suffixes n / s / m / l correspond to the
    세분류 / 소분류 / 중분류 / 대분류 fields (presumably finest to broadest level).
    For the H attribute only the 중분류 and 대분류 fields are attached.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding the raw attribute code columns. It is not modified.
    d_code, h_code, l_code : dict
        Lookup tables for the D, H and L attribute codes.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` with the derived columns appended.

    Raises
    ------
    KeyError
        If a code in the frame, or an expected field, is missing from its
        lookup table.
    """
    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    h_levels = (
        ('m', '속성 H 중분류코드'),
        ('l', '속성 H 대분류코드'),
    )
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )

    # (source column, lookup table, levels) in the order the columns are appended.
    d_sources = ('person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d')
    h_sources = ('contents_attribute_h', 'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3')
    plan = [(source, d_code, d_levels) for source in d_sources]
    plan += [(source, h_code, h_levels) for source in h_sources]
    plan.append(('contents_attribute_l', l_code, l_levels))

    enriched = df_.copy()
    for source, lookup, levels in plan:
        for suffix, field in levels:
            enriched[source + '_' + suffix] = enriched[source].apply(
                lambda code: lookup[code][field]
            )
    return enriched

In [3]:
# Code lookup tables: first CSV column becomes the key, giving
# {code: {field name: value}} after the transpose + to_dict.
d_code, h_code, l_code = [
    pd.read_csv(path, index_col=0).T.to_dict()
    for path in ('../data/dcode.csv', '../data/hcode.csv', '../data/lcode.csv')
]

# Raw train / test tables.
df_train, df_test = [
    pd.read_csv(path) for path in ('../data/train.csv', '../data/test.csv')
]

# Attach the hierarchy codes to both tables.
train_df, test_df = [
    add_code(raw, d_code, h_code, l_code) for raw in (df_train, df_test)
]

In [4]:
# Tag every row with its origin so train and test can share one
# feature-engineering pass and be separated again afterwards.
train_df['aux'] = 'train'
test_df['aux'] = 'test'
test_df['target'] = -1  # placeholder so both frames expose the same columns

# Same column order in both frames before stacking.
columns = sorted(test_df.columns)
train_df = train_df[columns]
test_df = test_df[columns]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with default arguments stacks the rows the same way
# (original row labels are kept).
train_df_aux = pd.concat([train_df, test_df])

# Frequency features computed over train + test together.
# NOTE(review): 'count' counts non-null values, so count_person_content equals
# count_person (and count_content_person equals count_content) unless the
# counted column contains nulls - confirm whether nunique was intended.
train_df_aux['count_person'] = train_df_aux.groupby('person_rn')['person_rn'].transform('count')
train_df_aux['count_content'] = train_df_aux.groupby('contents_rn')['contents_rn'].transform('count')
train_df_aux['count_person_content'] = train_df_aux.groupby('person_rn')['contents_rn'].transform('count')
train_df_aux['count_content_person'] = train_df_aux.groupby('contents_rn')['person_rn'].transform('count')

# Split back by origin. drop() without inplace returns a new frame, which
# avoids mutating a filtered slice (the source of SettingWithCopyWarning).
train_ = train_df_aux[train_df_aux['aux'] == 'train'].drop(columns=['aux'])
test_ = train_df_aux[train_df_aux['aux'] == 'test'].drop(columns=['aux', 'target'])

train = train_.copy()
test = test_.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [5]:
# Submission template; its 'target' column is filled with the test
# predictions before the file is written out.
sample = pd.read_csv('../data/sample_submission.csv')

In [6]:
# Parse the content open timestamp into datetime values in both frames.
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'])
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'])

In [7]:
# contents_open_dt has already been converted with pd.to_datetime in the
# preceding cell, so the verbatim repeat of that loop is not needed here.

# Identifier columns and the two person_prefer_f / person_prefer_g fields are
# removed from the model inputs. Rebinding instead of inplace=True keeps the
# frames free of in-place mutation.
dropped_columns = ['id','person_prefer_f','person_prefer_g','person_rn','contents_rn']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

# Every remaining column is treated as categorical unless its name contains
# 'match' or it is the target / the timestamp. This includes the count_*
# features created earlier.
non_categorical = ['target','contents_open_dt']
cat_columns = [col for col in train.columns if 'match' not in col and col not in non_categorical]

In [8]:
# Calendar features derived from the content open timestamp:
# 'dia' = day of month, 'hora' = hour, plus the day of week.
# cat_columns is not recomputed here, so these three columns stay out of it.
for frame in (train, test):
    opened = frame['contents_open_dt'].dt
    frame['dia'] = opened.day
    frame['hora'] = opened.hour
    frame['dayofweek'] = opened.dayofweek

In [9]:
# Keep a handle on the full labelled frame; it is used again for fitting the
# label encoders and for the final refit on all labelled rows.
train_aux = train

# Time-based hold-out: rows opened in months 1-9 train the model, rows opened
# in months 10-12 validate it. contents_open_dt is already a datetime column
# (the .dt accessor is used on it in the previous cell), so the month is read
# once with the vectorised accessor instead of building a pd.Timestamp per row
# twice.
# NOTE(review): only the month is compared, the year is ignored - confirm the
# data covers a single year.
open_month = train_aux['contents_open_dt'].dt.month
train = train_aux[open_month < 10].copy()
val = train_aux[open_month >= 10].copy()

# The raw timestamp is not a model input.
train = train.drop(columns=['contents_open_dt'])
val = val.drop(columns=['contents_open_dt'])
test = test.drop(columns=['contents_open_dt'])

# Same feature order everywhere; multiplying by 1 turns boolean columns into
# integers (presumably the *match* flags - confirm).
columns = sorted(test.columns)
train = train[columns+['target']]*1
val = val[columns+['target']]*1
test = test[columns]*1

train_aux = train_aux[columns+['target']]*1

In [10]:
# Sanity check of the split sizes: train, validation, all labelled rows, test.
for frame in (train, val, train_aux, test):
    print(frame.shape)

(411367, 64)
(90584, 64)
(501951, 64)
(46404, 63)


In [11]:
# Label-encode every categorical column and record, for TabNet, the position
# of each such column and the number of distinct codes it can take.
cat_idxs = []
cat_dims = []
for position, name in enumerate(train.columns):
    if name not in cat_columns:
        continue

    # The encoder is fitted on all labelled rows (train + validation).
    encoder = LabelEncoder()
    encoder.fit(train_aux[name].values)
    code_of = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

    # Values never seen during fitting share one extra code at the end.
    unseen_code = len(code_of)
    for frame in (train, val, test, train_aux):
        frame[name] = frame[name].apply(lambda raw: code_of.get(raw, unseen_code))

    cat_idxs.append(position)
    cat_dims.append(unseen_code + 1)  # +1 reserves the slot for unseen values

# Plain arrays for the model.
X_train, y_train = train.drop(columns='target').values, train['target'].values
X_val, y_val = val.drop(columns='target').values, val['target'].values
X_test = test.values
eval_set = (X_val, y_val)

# All labelled rows, used for the final refit.
X_train_full = train_aux.drop(columns='target').values
y_train_full = train_aux['target'].values

In [14]:
# TabNet classifier. cat_idxs / cat_dims are the categorical column positions
# and code counts collected during label encoding; every categorical column
# gets a 3-dimensional embedding (cat_emb_dim).
clf = TabNetClassifier(seed = 1990,
                       n_steps=6,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=3,
                       optimizer_fn=torch.optim.AdamW, # optimizer class handed to TabNet
                       mask_type='entmax', # alternative noted by the author: "sparsemax"
                      )

Device used : cuda


In [15]:
# Fit on the pre-October rows and report metrics on both the training rows
# and the hold-out rows after every epoch.
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'val'],
    # 'f1' presumably resolves by name to the custom F1_Score metric defined
    # at the top of the notebook (its _name is "f1") - confirm.
    eval_metric=['logloss','f1'],
    # NOTE(review): per the pytorch-tabnet docs, early stopping watches the
    # last metric on the last eval set (here val f1) - confirm.
    max_epochs=200 , patience=30,
    batch_size=256,
    virtual_batch_size=256,
    num_workers=1,
    drop_last=False,

)

epoch 0  | loss: 0.67536 | train_logloss: 0.65356 | train_f1: 0.68219 | val_logloss: 0.65755 | val_f1: 0.67381 |  0:01:38s
epoch 1  | loss: 0.64451 | train_logloss: 0.63643 | train_f1: 0.69562 | val_logloss: 0.64299 | val_f1: 0.68329 |  0:03:17s
epoch 2  | loss: 0.63499 | train_logloss: 0.62764 | train_f1: 0.69944 | val_logloss: 0.63811 | val_f1: 0.68418 |  0:04:56s
epoch 3  | loss: 0.62999 | train_logloss: 0.62455 | train_f1: 0.7025  | val_logloss: 0.63786 | val_f1: 0.68569 |  0:06:35s
epoch 4  | loss: 0.62615 | train_logloss: 0.62062 | train_f1: 0.70601 | val_logloss: 0.64089 | val_f1: 0.68773 |  0:08:13s
epoch 5  | loss: 0.6229  | train_logloss: 0.61219 | train_f1: 0.704   | val_logloss: 0.63855 | val_f1: 0.67968 |  0:09:52s
epoch 6  | loss: 0.62027 | train_logloss: 0.61509 | train_f1: 0.70913 | val_logloss: 0.63901 | val_f1: 0.6886  |  0:11:31s
epoch 7  | loss: 0.61826 | train_logloss: 0.60502 | train_f1: 0.71026 | val_logloss: 0.64066 | val_f1: 0.68003 |  0:13:11s
epoch 8  | loss:

In [16]:
# Class probabilities for the hold-out rows; column 1 is the one thresholded
# in the sweep that follows.
preds = clf.predict_proba(X_val)

In [17]:
# Hold-out F1 for a range of decision thresholds on the column-1 probability.
for threshold in (0.2, 0.25, 0.3, 0.35, 0.4, 0.5):
    hard_labels = (preds[:, 1] > threshold) * 1
    print(f1_score(y_val, hard_labels))

0.6723496543518532
0.6769467885428271
0.6819084094252821
0.6868764821737208
0.6885998729082821
0.6395352596919693


In [None]:
# Refit on every labelled row (train + validation) before predicting the test set.
# No eval_set is passed, so no validation metric is available during this fit.
# NOTE(review): this reuses the clf object fitted above; whether training
# restarts or continues from the existing weights depends on pytorch-tabnet's
# warm_start setting - confirm.
# NOTE(review): virtual_batch_size is 128 here but 256 in the validated run.
clf.fit(
    X_train=X_train_full, 
    y_train=y_train_full,
    max_epochs=20,
    batch_size=256,
    patience=10,
    virtual_batch_size=128,
    num_workers=1,
    drop_last=False,

)

In [18]:
# Hard 0/1 labels for the test rows: column-1 probability above the 0.4 cut-off.
preds = (clf.predict_proba(X_test)[:, 1] > 0.4) * 1

In [19]:
# Submission file: fill the template's target column with the hard test
# predictions and write it without the index.
sample['target'] = preds
sample.to_csv('sub.csv',index=False)

In [21]:
# Ensemble: majority vote over four earlier TabNet submission files.
sub1 = pd.read_csv('sub_tabnet_26.csv')
sub2 = pd.read_csv('sub_tabnet_23.csv')
sub3 = pd.read_csv('sub_tabnet_25.csv')
sub4 = pd.read_csv('sub_tabnet_28.csv')

# Mean of the four 0/1 votes per row (rows are matched on the frames' index).
mean_vote = (sub1['target'] + sub2['target'] + sub3['target'] + sub4['target']) / 4

# Rounding a Series uses round-half-to-even, so a 2-2 tie (0.5) becomes 0.
sub1['target_final'] = round(mean_vote, 0).astype(int)

# Drop the individual vote exactly once. The original cell repeated this drop,
# and the second call raised KeyError because 'target' was already gone.
sub1 = sub1.drop(columns=['target'])

# Remaining columns are the identifier and the ensembled label.
sub1.columns = ['id','target']
sub1.to_csv('sub_tabnet_29.csv',index=False)