Predicción utilizando BERT, en múltiples folds.
La última capa densa de la red se exporta y se usa, junto con los features, para entrenar un boosting tree.
Librerías probadas: primero xgboost y, posteriormente, lightgbm.

In [5]:
import os
import gc
import pandas as pd
import numpy as np
import random
import time
import warnings
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from keras import backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Average
from tensorflow.keras.activations import sigmoid
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import LearningRateScheduler
import lightgbm as lgb

from sklearn.model_selection import KFold

warnings.filterwarnings("ignore")

In [6]:
import pandas as pd

# Load the train/test splits and totaldata.csv, the table the tabular
# features are taken from in the following cells.
train, test, totaldata = map(pd.read_csv, ('train.csv', 'test.csv', 'totaldata.csv'))

# Rows listed here have their label overridden to 0 (the variable name marks
# them as carrying a wrong target in the raw data).
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
has_target_error = train['id'].isin(ids_with_target_error)
train.loc[has_target_error, 'target'] = 0

In [7]:
# Remove the raw text columns in place; only the remaining columns are used
# as tabular features.
totaldata.drop(columns=['text', 'keyword', 'location'], inplace=True)

# Rows with target == 2 form the test feature table, every other row the
# training one.
belongs_to_test = totaldata['target'] == 2
f_train = totaldata.loc[~belongs_to_test]
f_test = totaldata.loc[belongs_to_test]

In [8]:
# The target column was only needed to split the rows; strip it from both
# feature tables, then release the combined table.
f_train, f_test = (frame.drop(columns='target') for frame in (f_train, f_test))
del totaldata

In [9]:
# Learn the per-column min/max on the training features only and reuse that
# same mapping for the test features. Re-fitting on the test set would scale
# the two tables differently, so identical raw values would reach the model
# as different numbers. Test values outside the training range may fall
# outside [0, 1].
scaler = MinMaxScaler()
f_train = scaler.fit_transform(f_train)
f_test = scaler.transform(f_test)

In [10]:
# Number of columns in the training feature matrix.
f_num = f_train.shape[1]

In [11]:
from transformers import BertTokenizer
# Tokenizer of the 'bert-large-uncased' checkpoint with lower-casing enabled;
# greed_encode reads it as a module-level name.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)


def greed_encode(data, max_len):
    """Tokenize ``data.text`` into fixed-length id and attention-mask arrays.

    Every text is encoded with the module-level ``tokenizer`` (special tokens
    included). Encodings that fit in ``max_len`` are right-padded with the
    tokenizer's pad id. Longer encodings keep their first and last tokens and
    drop the middle ones, so both the beginning and the end of the post
    survive the cut.

    Args:
        data: object exposing an iterable ``text`` attribute of strings
            (e.g. a DataFrame with a ``text`` column).
        max_len: number of tokens in every encoded row.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of integer arrays, each of
        shape ``(number of texts, max_len)``.
    """
    # Split the budget so head + tail is exactly max_len, even when it is odd.
    head_len = max_len // 2
    tail_len = max_len - head_len
    pad_id = tokenizer.pad_token_id

    input_ids = []
    attention_masks = []
    # Iterate the values directly: positional, so it does not depend on the
    # index labels of ``data``.
    for text in data.text:
        ids = tokenizer.encode(text, add_special_tokens=True)

        # Only cut when the text really overflows. Slicing a shorter text
        # would make head and tail overlap and duplicate its middle tokens.
        if len(ids) > max_len:
            ids = ids[:head_len] + ids[len(ids) - tail_len:]

        padding = max_len - len(ids)
        attention_masks.append([1] * len(ids) + [0] * padding)
        input_ids.append(ids + [pad_id] * padding)

    # Explicit reshape keeps the (n, max_len) shape even for empty input.
    shape = (len(input_ids), max_len)
    return (
        np.array(input_ids, dtype=int).reshape(shape),
        np.array(attention_masks, dtype=int).reshape(shape),
    )

################# Encode the tweets of both splits into token ids and attention masks (length 50)
train_ids,train_masks = greed_encode(train,50)
test_ids,test_masks = greed_encode(test,50)
# Training labels.
y_train = train.target



In [12]:
def create_model(bert_model, MAX_LEN=50):
    """Build and compile the BERT-based binary classifier.

    The network takes token ids and an attention mask, runs them through
    ``bert_model``, pools the per-token output with both average and max
    pooling, and feeds the concatenation to five independent
    dropout -> Dense(64) -> Dense(1, sigmoid) heads whose outputs are
    averaged.

    The tabular features are not an input of this network; ``lgb_cv`` joins
    them with the exported hidden layer at the LightGBM stage.

    Args:
        bert_model: loaded ``TFBertModel``; its first output is used as the
            sequence of per-token hidden states.
        MAX_LEN: fixed length of both input sequences.

    Returns:
        A compiled Keras ``Model`` with inputs ``[input_ids, attention_mask]``
        and a single averaged sigmoid output named ``output``.
    """
    ####### text inputs
    ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

    # Index the output instead of tuple-unpacking it: indexing works both
    # when the model returns a plain tuple and when it returns a dict-like
    # output object (unpacking the latter would yield its keys).
    last_hidden = bert_model({'input_ids': ids, 'attention_mask': mask})[0]
    last_hidden = layers.Dropout(0.2)(last_hidden)

    ####### pool over the token axis and concatenate both summaries
    x_avg = layers.GlobalAveragePooling1D()(last_hidden)
    x_max = layers.GlobalMaxPooling1D()(last_hidden)
    x = layers.Concatenate()([x_avg, x_max])

    ####### several dense heads on top of the pooled BERT output, each behind
    ####### its own dropout; their predictions are averaged
    samples = []
    for n in range(5):
        sample = layers.Dropout(.5)(x)
        # The 'dense_{n}' names are part of the contract: the training loop
        # exports the activations of 'dense_1'.
        sample = layers.Dense(64, activation='relu', name=f'dense_{n}')(sample)
        sample = layers.Dense(1, activation='sigmoid', name=f'sample_{n}')(sample)
        samples.append(sample)
    output = layers.Average(name='output')(samples)

    model = Model(inputs=[ids, mask], outputs=output)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=1e-5),
        loss=BinaryCrossentropy(label_smoothing=0.1),
        metrics=['accuracy'],
    )
    return model

In [13]:
from transformers import TFBertModel
# Load the pretrained 'bert-large-uncased' weights.
bert_model = TFBertModel.from_pretrained('bert-large-uncased')

# Build the classifier once here to print its layer summary; the training
# loop loads BERT again and builds a new model for every fold.
model = create_model(bert_model)
model.summary()

In [14]:
def lgb_cv(X_train, X_test, n_splits=5, random_state=None):
    """Train LightGBM on K folds and average the fold predictions for the test set.

    The given matrices are joined column-wise with the module-level tabular
    features (``f_train`` / ``f_test``); labels come from the module-level
    ``y_train``. One booster is trained per fold and its test prediction
    contributes ``1 / n_splits`` to the result.

    Args:
        X_train: array-like with one row per training sample.
        X_test: array-like with one row per test sample.
        n_splits: number of folds.
        random_state: seed for the fold shuffling; ``None`` keeps the splits
            non-deterministic.

    Returns:
        1-D numpy array with the averaged prediction for every test row.
    """
    labels = np.asarray(y_train).ravel()

    # Join with the original features as plain arrays. Concatenating two
    # DataFrames here would give duplicate integer column labels, which
    # LightGBM can reject as repeated feature names.
    train_matrix = np.column_stack([np.asarray(X_train), np.asarray(f_train)])
    # The test matrix does not depend on the fold, so build it once.
    test_matrix = np.column_stack([np.asarray(X_test), np.asarray(f_test)])

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds_predict = np.zeros(len(test_matrix))

    for trn, val in folds.split(train_matrix):
        ############### LightGBM datasets for this fold
        train_data = lgb.Dataset(train_matrix[trn], label=labels[trn])
        val_data = lgb.Dataset(train_matrix[val], label=labels[val], reference=train_data)

        ############### LightGBM training
        # NOTE(review): no objective is set, so LightGBM falls back to its
        # default objective on the 0/1 target and predictions are not
        # constrained to [0, 1] - confirm this is intended.
        clf = lgb.train({'boosting_type': 'gbdt'},
                        train_set=train_data,
                        valid_sets=[train_data, val_data])

        folds_predict += clf.predict(test_matrix) / n_splits

    return folds_predict

In [15]:
############# Callbacks
def lr_sc(epoch):
    """Return the learning rate for ``epoch``: 1.5e-5 divided by (epoch + 1)."""
    base_rate = 1.5e-5
    return base_rate / (epoch + 1)
# Lower the learning rate as the epochs advance, following lr_sc.
scheduler = LearningRateScheduler(lr_sc)

# Stop when val_loss has not improved by at least 0.001 for two consecutive
# epochs, and roll back to the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=2,
    baseline=None,
    restore_best_weights=True,
)

In [None]:
# Labels as a plain array so they can be indexed with the fold positions.
y_train = train.target.values
# One column per model and fold; the columns are averaged afterwards.
preds = pd.DataFrame()

folds = KFold(n_splits=4, shuffle=True)

for i, (trn, val) in enumerate(folds.split(train_ids)):

    # Every fold starts from the pretrained weights, not from the weights
    # fine-tuned on the previous fold.
    bert_model = TFBertModel.from_pretrained('bert-large-uncased')
    model = create_model(bert_model)
    history = model.fit(
        x = [train_ids[trn], train_masks[trn]],
        y = y_train[trn],
        validation_data=(
            [train_ids[val], train_masks[val]],
            y_train[val]
        ),
        batch_size=16,
        epochs=3,
        callbacks=[scheduler, es]
    )
    ################# Export the activations of 'dense_1', one of the 64-unit
    ################# hidden layers that sit right before the output heads
    abstract_model = Model(model.input, outputs=model.get_layer('dense_1').output)
    ################# These activations become features for LightGBM,
    ################# recomputed for every fold
    X_train = abstract_model.predict([train_ids,train_masks])
    X_test = abstract_model.predict([test_ids,test_masks])

    preds[f'bert_{i}'] = model.predict([test_ids,test_masks]).reshape(-1)
    preds[f'bert_hidden_lgb_{i}'] = lgb_cv(X_train,X_test)

    # Release every object holding the BERT weights before the next fold
    # loads a new copy. The session is cleared through tf.keras because that
    # is the Keras implementation the models were built with.
    del model, abstract_model, bert_model
    tf.keras.backend.clear_session()
    gc.collect()

In [44]:
# Average every model/fold column and threshold at 0.5. A plain comparison
# always yields 0 or 1, whereas rounding could emit other integers if an
# averaged score left the [-0.5, 1.5) range (the LightGBM columns are not
# bounded to [0, 1]). For means inside that range the result is unchanged,
# including an exact 0.5, which still maps to 0.
val = (preds.mean(axis=1) > 0.5).astype(int)

In [None]:
##################################################################################
##################################################################################
##################################################################################

In [38]:
# Start from the sample submission and overwrite its target column with the
# ensembled labels (aligned on the row index), then preview the first rows.
submission = pd.read_csv('sample_submission.csv').assign(target=val)
submission.head(2)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [39]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission-lgbm.csv', index=False)