In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import os, time, datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc
import lightgbm as lgb
import xgboost as xgb

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    ``elapsed`` is rounded to the nearest whole second with the built-in
    ``round`` and the string form of the matching ``datetime.timedelta`` is
    returned (durations of a day or more therefore read ``'1 day, 1:00:00'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

class SigirPreprocess():
    """Load the product TSV files and derive the text columns used downstream.

    Attributes set by the methods:
        train            -- merged X_train / Y_train frame with the extra columns
                            ``labels``, ``product``, ``title_len``, ``desc_len``,
                            ``title_desc_len`` and ``title_desc``.
        list_tags        -- distinct ``Prdtypecode`` values in order of appearance.
        dict_code_to_id  -- ``Prdtypecode`` -> contiguous label id.
        dict_id_to_code  -- inverse of ``dict_code_to_id``.
        sentences/labels -- arrays filled by ``get_sentences``.
        X_test           -- test frame loaded by ``prepare_test``.
        test_sentences   -- text column of ``X_test``.
    """

    def __init__(self, text_data_path):
        """Remember the folder that contains the ``data/`` directory; no I/O here."""
        self.text_data_path = text_data_path
        self.train = None
        self.dict_code_to_id = {}
        self.dict_id_to_code = {}
        self.list_tags = []
        self.sentences = []
        self.labels = []
        self.text_col = None
        self.X_test = None
        self.test_sentences = []

    @staticmethod
    def _read_tsv(base_path, file_name):
        """Read ``<base_path>/data/<file_name>`` as a tab-separated file."""
        # os.path.join tolerates base paths with or without a trailing separator.
        return pd.read_csv(os.path.join(base_path, "data", file_name), sep="\t")

    @staticmethod
    def _word_count(column):
        """Return the number of whitespace-separated tokens per cell (0 for nulls)."""
        # Use tqdm's progress_apply when tqdm.pandas() has registered it,
        # otherwise fall back to the plain pandas apply.
        apply_fn = getattr(column, "progress_apply", column.apply)
        return apply_fn(lambda text: len(text.split()) if pd.notna(text) else 0)

    def prepare_data(self):
        """Load the training files, build the label mappings and text features.

        Reads ``catalog_english_taxonomy.tsv``, ``X_train.tsv`` and
        ``Y_train.tsv`` from ``<text_data_path>/data`` and stores the merged
        frame in ``self.train``.
        """
        catalog_eng = self._read_tsv(self.text_data_path, "catalog_english_taxonomy.tsv")
        X_train = self._read_tsv(self.text_data_path, "X_train.tsv")
        Y_train = self._read_tsv(self.text_data_path, "Y_train.tsv")

        # Label ids follow the order in which the codes first appear in Y_train.
        # The mappings are rebuilt from scratch so a second call cannot keep
        # stale entries from a previous data set.
        self.list_tags = list(Y_train['Prdtypecode'].unique())
        self.dict_code_to_id = {tag: i for i, tag in enumerate(self.list_tags)}
        self.dict_id_to_code = {i: tag for i, tag in enumerate(self.list_tags)}
        print(self.dict_code_to_id)

        Y_train['labels'] = Y_train['Prdtypecode'].map(self.dict_code_to_id)
        train = pd.merge(left=X_train, right=Y_train, how='left',
                         on=['Integer_id', 'Image_id', 'Product_id'])
        prod_map = pd.Series(catalog_eng['Top level category'].values,
                             index=catalog_eng['Prdtypecode']).to_dict()

        train['product'] = train['Prdtypecode'].map(prod_map)
        # Lengths are computed before the null descriptions are blanked out,
        # so a missing description counts as zero words.
        train['title_len'] = self._word_count(train['Title'])
        train['desc_len'] = self._word_count(train['Description'])
        train['title_desc_len'] = train['title_len'] + train['desc_len']
        train['Description'] = train['Description'].fillna(" ")
        # A null Title still yields a null title_desc; get_sentences can drop it.
        train['title_desc'] = train['Title'] + " " + train['Description']

        self.train = train

    def get_sentences(self, text_col, remove_null_rows=False):
        """Expose ``text_col`` and ``labels`` of the training frame as arrays.

        Args:
            text_col: name of the text column in ``self.train``.
            remove_null_rows: when true, rows whose ``text_col`` is null are
                left out of both arrays.

        Raises:
            RuntimeError: if ``prepare_data`` has not been called yet.
        """
        if self.train is None:
            raise RuntimeError("prepare_data() must be called before get_sentences().")

        self.text_col = text_col
        if remove_null_rows:
            selected = self.train[self.train[text_col].notnull()]
        else:
            selected = self.train.copy()

        self.sentences = selected[text_col].values
        self.labels = selected['labels'].values

    def prepare_test(self, text_col, test_data_path, phase=1):
        """Load ``x_test_task1_phase<phase>.tsv`` and build its ``title_desc`` column.

        Args:
            text_col: column of the test frame stored in ``self.test_sentences``.
            test_data_path: folder that contains the ``data/`` directory.
            phase: number inserted into the test file name.
        """
        X_test = self._read_tsv(test_data_path, f"x_test_task1_phase{phase}.tsv")
        X_test['Description'] = X_test['Description'].fillna(" ")
        X_test['title_desc'] = X_test['Title'] + " " + X_test['Description']
        self.X_test = X_test
        self.test_sentences = X_test[text_col].values
 

In [None]:
# Notebook-wide settings read by the cells below.
# Text column handed to SigirPreprocess.get_sentences / prepare_test.
text_col = 'title_desc'
# test_size of the first train/validation split of the labelled data.
val_size = 0.1
# Seed passed as random_state to the train_test_split calls.
random_state=2020
# Number of target classes given to the LightGBM and XGBoost models.
num_class = 27
# The RandomizedSearchCV object is only fitted when this is True.
do_gridsearch = False

In [None]:
# Extra options forwarded to preparelogits_df: the keys listed under
# 'add_logits' are averaged into one additional block of columns.
kwargs = dict(add_logits=['cam', 'fla'])

# One input folder per upstream model output.
cam_path = '/../input/camembert-vec-256m768-10ep/'
flau_path = '/../input/flaubertlogits2107/'
res_path = '/../input/resnextfinal/'
cms_path = '/../input/crossmodal-v0/'
vca_path = '/../input/vec-concat-9093/'
vca_path_phase2 = '/../input/predictions-test-phase2-vec-fusion/'
aem_path = '/../input/addition-ensemble-latest/'

# .npy files joined onto the validation rows (key -> file).
val_logits_path = {
    'cam': f'{cam_path}validation_set_softmax_logits.npy',
    'fla': f'{flau_path}validation_set_softmax_logits.npy',
    'res': f'{res_path}Valid_resnext50_32x4d_phase1_softmax_logits.npy',
    'vca': f'{vca_path}softmax_logits_val_9093.npy',
    'aem': f'{aem_path}softmax_logits_val_add.npy',
}

# .npy files joined onto the phase 1 test rows.
test_logits_path_phase1 = {
    'cam': f'{cam_path}X_test_phase1_softmax_logits.npy',
    'fla': f'{flau_path}X_test_phase1_softmax_logits.npy',
    'res': f'{res_path}Test_resnext50_32x4d_phase1_softmax_logits.npy',
    'vca': f'{vca_path}softmax_logits_test_9093.npy',
}

# .npy files joined onto the phase 2 test rows.
test_logits_path_phase2 = {
    'cam': f'{cam_path}X_test_phase2_softmax_logits.npy',
    'fla': f'{flau_path}X_test_phase2_softmax_logits.npy',
    'res': f'{res_path}Test_resnext50_32x4d_phase2_softmax_logits.npy',
    'vca': f'{vca_path_phase2}softmax_logits_test_phase2_9093.npy',
}
                           



In [None]:
## Get validation dataset from original train dataset
Preprocess = SigirPreprocess("/../input/textphase1/")
Preprocess.prepare_data()
Preprocess.get_sentences(text_col, True)

full_data = Preprocess.train
labels = Preprocess.labels
index = full_data.Integer_id

# get_sentences(..., True) leaves out rows whose text is null, so `labels` can
# be shorter than the frame; the split below needs them to line up row by row.
if len(labels) != len(full_data):
    raise ValueError(
        "labels ({}) and training frame ({}) differ in length; "
        "rows with a null '{}' were dropped from the labels only".format(
            len(labels), len(full_data), text_col))

tr_index, val_index, tr_labels, val_labels = train_test_split(index, labels,
                                                    stratify=labels,
                                                    random_state=random_state,
                                                    test_size=val_size)

# tr_index / val_index are Series of Integer_id *values*; the rows they were
# drawn from are identified by the Series' own index. Selecting through that
# index keeps val_data aligned with val_labels even when Integer_id does not
# coincide with the frame's row labels.
train_data = full_data.loc[tr_index.index, :].reset_index(drop=True)
val_data = full_data.loc[val_index.index, :].reset_index(drop=True)

# Tag every row of the full frame with the split it belongs to (plain
# assignment instead of a chained in-place fillna, which is a no-op under
# pandas copy-on-write).
full_data['sample'] = np.where(full_data.index.isin(val_index.index), 'val', 'train')

In [None]:
def preparelogits_df(logit_paths, df=None, val_labels=None, **kwargs):
    """Load per-model score arrays and append them as columns to ``df``.

    Args:
        logit_paths: mapping ``key -> path`` of ``.npy`` files, each holding a
            2-D array (rows x classes). The columns are named ``<key>_1`` ...
            ``<key>_<n_classes>``.
        df: frame the new columns are concatenated to (column-wise, aligned on
            the index). ``None`` returns the score columns only.
        val_labels: optional true label ids; when given, the macro F1 of the
            arg-max of every score array is printed.
        **kwargs: ``add_logits`` -- optional list of keys from ``logit_paths``
            whose arrays are averaged element-wise into an additional block
            named by joining the keys with ``_`` (e.g. ``cam_fla``).

    Returns:
        A new DataFrame: ``df`` followed by one block of columns per key and,
        if requested, the averaged block.

    Raises:
        KeyError: if ``add_logits`` names a key missing from ``logit_paths``.
    """
    def _as_frame(prefix, scores):
        # Column count follows the array instead of a hard-coded class count.
        names = [prefix + "_" + str(i) for i in range(1, scores.shape[1] + 1)]
        return pd.DataFrame(scores, columns=names)

    logits_dict = {}
    dfs_dict = {}
    for key, logit_path in logit_paths.items():
        logits_dict[key] = np.load(logit_path)
        dfs_dict[key] = _as_frame(key, logits_dict[key])
        print("Shape of logit arrays: {}".format(logits_dict[key].shape))

    add_logits = kwargs.get('add_logits') or []
    if len(add_logits) > 0:
        add_str = '_'.join(add_logits)
        # Accumulate out of place: an in-place `+=` on the first array would
        # overwrite that model's own scores (and anything sharing its memory).
        total = logits_dict[add_logits[0]]
        for k in add_logits[1:]:
            total = total + logits_dict[k]
        logits_dict[add_str] = total / len(add_logits)
        dfs_dict[add_str] = _as_frame(add_str, logits_dict[add_str])
        print("Shape of logit arrays: {}".format(logits_dict[add_str].shape))

    if val_labels is not None:
        true_labels = np.asarray(val_labels)
        for key, logits in logits_dict.items():
            print("""Validation F1 scores for {} logits: {} """.format(key,
                f1_score(true_labels, np.argmax(logits, axis=1), average='macro')))

    df = pd.concat([df] + list(dfs_dict.values()), axis=1)

    return df

In [None]:
# Append the columns of every array in val_logits_path (plus the averaged
# 'add_logits' block from kwargs) to the validation rows. Passing val_labels
# makes preparelogits_df print a macro F1 per score array.
# NOTE(review): val_data is replaced by its own widened version, so running
# this cell a second time appends the same columns again.
val_data = preparelogits_df(val_logits_path, df=val_data, 
                            val_labels=val_labels, **kwargs)

# Model Data Prep

In [None]:
# Work on a copy of the validation frame that carries the score columns.
df_log = val_data.copy()

# Meta-features: columns <model>_1 .. <model>_27, grouped model by model.
probas_cols = [
    f"{model}_{class_no}"
    for model in ("fla", "cam", "res", "vca")
    for class_no in range(1, 28)
]

X = df_log[probas_cols]
y = df_log['labels'].values

# Stratified 80/20 split of the validation rows for fitting / evaluating the
# second-level model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)


In [None]:
# Hyper-parameter search set-up for the LightGBM meta-classifier. The objects
# are always created (later cells read `fit_params` and `clf`), but the search
# itself is only fitted when do_gridsearch is True.
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# NOTE(review): GridSearchCV is imported but not used in the visible cells.
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Number of parameter settings sampled by the randomized search (n_iter).
n_HP_points_to_test = 100


# Search space: scipy distributions are sampled, plain lists are drawn from.
# sp_randint(a, b) yields integers in [a, b); sp_uniform(loc, scale) yields
# floats in [loc, loc + scale].
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
#              "bagging_fraction" : [0.5, 0.6, 0.7, 0.8, 0.9],
#              "feature_fraction":[0.5, 0.6, 0.7, 0.8, 0.9]
            }




# Keyword arguments forwarded to the estimator's fit(); the evaluation set is
# the 20% hold-out (X_test, y_test) produced by the split above.
# NOTE(review): whether fit() accepts `early_stopping_rounds` and `verbose`
# depends on the installed LightGBM version — confirm.
fit_params={
            "early_stopping_rounds":100, 
            "eval_metric" : 'multi_logloss', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}


# Base estimator for the search; its get_params() are also reused to build the
# final model in the training cell.
# NOTE(review): early_stopping_rounds is given here as well as in fit_params,
# and `silent` / `num_iteration` are version-dependent spellings — confirm
# against the installed LightGBM.
clf = lgb.LGBMClassifier(num_iteration=1000, max_depth=-1, random_state=314, silent=True,
                         metric='multi_logloss', n_jobs=4, early_stopping_rounds=100,
                         num_class=num_class, objective= "multiclass")
# 3-fold randomized search over param_test, refitting the best setting.
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

# The search only runs on request; otherwise `gs` stays unfitted.
if do_gridsearch==True:
    gs.fit(X_train, y_train, **fit_params)
    print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
# Hard-coded LightGBM settings applied on top of the defaults in the training
# cell. Presumably the best_params_ of an earlier search run — TODO confirm.
# Uncomment the next line to take them from a freshly fitted `gs` instead.
# opt_parameters = gs.best_params_
opt_parameters = {'colsample_bytree': 0.5284213741879101, 'min_child_samples': 125, 
         'min_child_weight': 10.0, 'num_leaves': 22, 
         'reg_alpha': 0.1, 'reg_lambda': 20, 'subsample': 0.3080033455431848} 


# Model Training

In [None]:
### Run lightgbm to get weights for different class logits

t0 = time.time()

# Trainer selection: 'train' -> native lgb.train, 'xgb' -> XGBoost,
# anything else (e.g. 'fit') -> scikit-learn style LGBMClassifier.
model_met = 'fit' #'xgb'#'train' #fit

# Defaults for the native LightGBM API (only used when model_met == 'train').
params = {
          "objective" : "multiclass",
          "num_class" : num_class,
          "num_leaves" : 60,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 5,        # subsample_freq
          "bagging_seed" : 2018,
          "verbosity" : -1 }

lgtrain, lgval = lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)

if model_met == 'train':
    # The tuned values use scikit-learn style names. Store them under the
    # native names already present in `params`, so they replace the defaults
    # instead of being passed next to them as a conflicting alias.
    native_names = {'subsample': 'bagging_fraction',
                    'colsample_bytree': 'feature_fraction'}
    params.update({native_names.get(name, name): value
                   for name, value in opt_parameters.items()})
    # fit_params belongs to LGBMClassifier.fit (it carries eval_set, verbose,
    # ...) and must not be merged into the booster parameters. The number of
    # rounds, the metric and early stopping are passed as regular parameters,
    # because lgb.train has no `num_iterations` / `metric` keyword arguments.
    params.update({'num_iterations': 1000,
                   'metric': 'multi_logloss',
                   'early_stopping_round': fit_params['early_stopping_rounds']})

    lgbmodel = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval])
    # Booster.predict returns one score per class; arg-max gives the label id.
    train_logits = lgbmodel.predict(X_train)
    test_logits = lgbmodel.predict(X_test)

    train_pred = np.argmax(train_logits, axis=1)
    test_pred = np.argmax(test_logits, axis=1)
elif model_met == 'xgb':
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtrain.save_binary('xgb_train.buffer')
    dtest = xgb.DMatrix(X_test, label=y_test)

    num_round = 200
    xgb_param = {'max_depth': 5, 'eta': 0.1, 'seed':2020, 'verbosity':1,
                 'objective': 'multi:softmax', 'num_class':num_class}
    xgb_param['nthread'] = 4
    xgb_param['eval_metric'] = 'mlogloss'
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(xgb_param, dtrain, num_round, evallist
                    , early_stopping_rounds=10
                   )

    # NOTE(review): ntree_limit / best_ntree_limit depend on the installed
    # XGBoost version — confirm before upgrading.
    # With 'multi:softmax' the predictions are already class ids.
    train_logits = bst.predict(xgb.DMatrix(X_train), ntree_limit=bst.best_ntree_limit)
    test_logits = bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_ntree_limit)

    train_pred = train_logits
    test_pred = test_logits

else:

    lgbmodel = lgb.LGBMClassifier(**clf.get_params())
    #set optimal parameters
    lgbmodel.set_params(**opt_parameters)
    lgbmodel.fit(X_train, y_train, **fit_params)

    # LGBMClassifier.predict returns class labels directly.
    train_logits = lgbmodel.predict(X_train)
    test_logits = lgbmodel.predict(X_test)

    train_pred = train_logits
    test_pred = test_logits

print("Validation F1: {} and Training F1: {} ".format(
    f1_score(y_test, test_pred, average='macro'),
    f1_score(y_train, train_pred, average='macro')))

# Pick the fitted model and its raw per-feature importances for the active
# trainer; the 'xgb' path never creates `lgbmodel`, so it must use `bst`.
if model_met == 'train':
    tree_model = lgbmodel
    raw_importance = np.asarray(lgbmodel.feature_importance(), dtype=float)
elif model_met == 'xgb':
    tree_model = bst
    # get_score omits features that were never used in a split.
    split_counts = bst.get_score(importance_type='weight')
    raw_importance = np.array([split_counts.get(col, 0) for col in probas_cols],
                              dtype=float)
else:
    tree_model = lgbmodel
    raw_importance = np.asarray(lgbmodel.feature_importances_, dtype=float)

feat_imp = pd.DataFrame({'feature':probas_cols,
                         'logit_kind': [i.split('_')[0] for i in probas_cols],
                         'imp':raw_importance/raw_importance.sum()})

if model_met == 'train':
    lgbmodel.save_model('lgb_classifier_81feats.txt', num_iteration=lgbmodel.best_iteration)

# Share of the total importance contributed by each upstream model.
print("Feature Importances by logits group: \n          ",
      feat_imp.groupby(['logit_kind'])['imp'].sum())

import shap
explainer = shap.TreeExplainer(tree_model)
shap_values = explainer.shap_values(X)
print("Time Elapsed: {:}.".format(format_time(time.time() - t0)))

In [None]:
# Score both test phases with the trained meta-model and write one submission
# file per phase. The *_phase1 names are historical: inside the loop they hold
# the data of whichever phase is being processed.
for phase, path in enumerate(['/kaggle/input/textphase1/',
                              '/kaggle/input/testphase2/'], start=1):
    if phase==1:
        test_logits_path = test_logits_path_phase1
    else:
        test_logits_path = test_logits_path_phase2
    Preprocess.prepare_test(text_col, path, phase)
    X_test_phase1= Preprocess.X_test

    test_phase1 = preparelogits_df(test_logits_path,
                                df=X_test_phase1, val_labels=None, **kwargs)
    phase_features = test_phase1[probas_cols]

    # Predict with the model that was actually trained; `lgbmodel` does not
    # exist in 'xgb' mode, so it must only be touched in the LightGBM branches.
    if model_met == 'train':
        phase1_logits = lgbmodel.predict(phase_features.values)
        predictions = np.argmax(phase1_logits, axis=1)
    elif model_met == 'xgb':
        phase1_logits = bst.predict(xgb.DMatrix(phase_features),
                                    ntree_limit=bst.best_ntree_limit)
        predictions = phase1_logits
    else:
        phase1_logits = lgbmodel.predict(phase_features.values)
        predictions = phase1_logits

    # Label ids are used as keys of dict_id_to_code; XGBoost returns them as
    # floats, so normalise to integers before mapping back to product codes.
    X_test_phase1['prediction_model']= np.asarray(predictions).astype(int)
    X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)
    print(X_test_phase1['Prdtypecode'].value_counts())
    X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)
    X_test_phase1.to_csv(f'y_test_task1_phase{phase}_pred_.tsv',sep='\t',index=False)