## Dependencies

In [1]:
import glob
import numpy as np
import pandas as pd
from transformers import TFDistilBertModel
from tokenizers import BertWordPieceTokenizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D, Concatenate

In [2]:
# Auxiliary functions
# Transformer inputs
def preprocess_test(text, context, tokenizer, max_seq_len):
    """Build the three model inputs for a single (context, text) pair.

    The token sequence is laid out as ``101, <context ids>, 102`` followed by
    the tokenizer's encoding of ``text``; everything is cut to exactly
    ``max_seq_len`` entries.

    Args:
        text: String to encode as the second segment.
        context: String to encode as the first segment (the caller in this
            notebook passes the sentiment label).
        tokenizer: Object whose ``encode(str)`` returns an encoding exposing
            ``ids``, ``attention_mask``, ``pad(n)`` and ``truncate(n)``.
        max_seq_len: Length of each returned array.

    Returns:
        ``[input_ids, attention_mask, token_type_ids]`` as int32 numpy arrays.
    """
    # Strip the first and last ids the tokenizer wraps around the context;
    # the explicit 101 / 102 markers below take their place.
    # NOTE(review): 101 / 102 are presumably the [CLS] / [SEP] ids of the
    # vocabulary in use - confirm against vocab.txt.
    context_ids = tokenizer.encode(context).ids[1:-1]

    # Size of the context segment including its two markers. This used to be a
    # hard-coded 3, which is only right for single-token contexts and produced
    # arrays of mismatched lengths otherwise.
    prefix_len = len(context_ids) + 2

    encoded = tokenizer.encode(text)
    encoded.pad(max_seq_len)
    encoded.truncate(max_seq_len)

    # Prepending the context pushes the tail of the text past max_seq_len, so
    # each sequence is sliced back down to the target length.
    input_ids = ([101] + context_ids + [102] + encoded.ids)[:max_seq_len]
    attention_mask = ([1] * prefix_len + encoded.attention_mask)[:max_seq_len]
    token_type_ids = ([0] * prefix_len + [1] * (max_seq_len - prefix_len))[:max_seq_len]

    x = [np.asarray(input_ids, dtype=np.int32),
         np.asarray(attention_mask, dtype=np.int32),
         np.asarray(token_type_ids, dtype=np.int32)]

    return x

def get_data_test(df, tokenizer, MAX_LEN):
    """Encode every row of ``df`` and group the results by input type.

    Each row's ``text`` and ``sentiment`` columns are passed to
    ``preprocess_test``; the per-row outputs are then collected into one
    array per model input.

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns.
        tokenizer: Tokenizer forwarded to ``preprocess_test``.
        MAX_LEN: Sequence length forwarded to ``preprocess_test``.

    Returns:
        ``[input_ids, attention_masks, token_type_ids]`` - three numpy arrays,
        each built from one entry per row of ``df``.
    """
    encoded_rows = [
        preprocess_test(row_text, row_sentiment, tokenizer, MAX_LEN)
        for row_text, row_sentiment in zip(df["text"], df["sentiment"])
    ]

    # preprocess_test returns three arrays per row; regroup them column-wise.
    x_data = [np.asarray([row[part] for row in encoded_rows]) for part in range(3)]
    return x_data

def decode(pred_start, pred_end, text, tokenizer):
    """Turn a predicted token span back into a substring of ``text``.

    ``text`` is re-encoded to obtain per-token character offsets, and the
    characters of tokens ``pred_start`` .. ``pred_end`` (inclusive) are joined,
    with a single space wherever two consecutive selected tokens were not
    adjacent in ``text``.

    Args:
        pred_start: Index of the first token of the span.
        pred_end: Index of the last token of the span; clamped to the last
            available token when it is out of range.
        text: The string the span is extracted from.
        tokenizer: Object whose ``encode(text)`` result exposes ``offsets`` as
            a sequence of ``(start_char, end_char)`` pairs.

    Returns:
        The decoded span, or ``""`` when the (clamped) end lies before the
        start.
    """
    offsets = tokenizer.encode(text).offsets

    # Keep the end index inside the encoded sequence.
    last = min(pred_end, len(offsets) - 1)

    pieces = []
    for i in range(pred_start, last + 1):
        start_char, end_char = offsets[i]
        pieces.append(text[start_char:end_char])
        # Only separate tokens that are both part of the span; checking the
        # token after the span used to leave a trailing space on the result.
        if i < last and end_char < offsets[i + 1][0]:
            pieces.append(" ")
    return "".join(pieces)

# Load data

In [3]:
# Read the competition test set into a DataFrame.
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

# Report the row count and preview the first rows.
print('Test samples: %s' % test.shape[0])
display(test.head(5))

Test samples: 3535


Unnamed: 0,textID,text,sentiment
0,11aa4945ff,http://twitpic.com/67swx - i wish i was calli...,positive
1,fd1db57dc0,i'm done.haha. HOUSE MD marathon ulet,positive
2,2524332d66,I'm concerned for that family,positive
3,0fb19285b2,HEY GUYS IT'S WORKING NO NEED TO WORRY. i have...,positive
4,e6c9e5e3ab,26th February,neutral


# Model parameters

In [4]:
# Sequence length used for every model input and for the size of each output head.
MAX_LEN = 128

# Pretrained DistilBERT weights and configuration.
base_path = '/kaggle/input/qa-transformers/distilbert/'
base_model_path = base_path + 'distilbert-base-uncased-distilled-squad-tf_model.h5'
config_path = base_path + 'distilbert-base-uncased-distilled-squad-config.json'

# Directory holding the vocabulary and every *.h5 weight file to predict with.
input_base_path = '/kaggle/input/6-tweet-train-distilbert-lower-bce-v2/'
tokenizer_path = input_base_path + 'vocab.txt'
model_path_list = sorted(glob.glob(input_base_path + '*.h5'))

print('Models to predict:')
print("\n".join(model_path_list))

Models to predict:
/kaggle/input/6-tweet-train-distilbert-lower-bce-v2/model.h5


# Tokenizer

In [5]:
# WordPiece tokenizer loaded from the vocabulary file at `tokenizer_path`, with lowercasing enabled.
tokenizer = BertWordPieceTokenizer(tokenizer_path, lowercase=True)

# Preprocessing

In [6]:
# Replace missing tweets with an empty string and lowercase everything.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained in-place update, which warns on recent
# pandas and leaves `test` unchanged under Copy-on-Write.
test['text'] = test['text'].fillna('').str.lower()

# Encode every (text, sentiment) row into the model's input arrays.
x_test = get_data_test(test, tokenizer, MAX_LEN)

# Model

In [7]:
def model_fn():
    """Build the span-prediction model: a DistilBERT base with two sigmoid heads.

    Three int32 inputs of shape ``(MAX_LEN,)`` are declared (``input_ids``,
    ``attention_mask``, ``token_type_ids``) and passed to a
    ``TFDistilBertModel`` loaded from ``base_model_path`` / ``config_path``.
    The first element of the base model's output is average-pooled and fed to
    two independent ``Dense(MAX_LEN, sigmoid)`` layers named ``y_start`` and
    ``y_end``.

    Returns:
        A Keras ``Model`` mapping the three inputs to ``[y_start, y_end]``.
    """
    input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='token_type_ids')
    
    base_model = TFDistilBertModel.from_pretrained(base_model_path, config=config_path, name="base_model")
    # NOTE(review): token_type_ids is forwarded here as well; whether the
    # DistilBERT implementation uses or ignores this key depends on the
    # installed transformers version - confirm.
    sequence_output = base_model({'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids})
    # Element 0 of the base model output is used as the hidden states to pool.
    last_state = sequence_output[0]
    
    x = GlobalAveragePooling1D()(last_state)
    
    # One sigmoid unit per position for the start and for the end of the span.
    y_start = Dense(MAX_LEN, activation='sigmoid', name='y_start')(x)
    y_end = Dense(MAX_LEN, activation='sigmoid', name='y_end')(x)
    
    model = Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=[y_start, y_end])
    
    return model

# Make predictions

In [8]:
# Number of test rows (the name is kept as-is in case other cells refer to it).
NUM_TEST_IMAGES = len(test)

# Fail fast: with no weight files the loop below would be skipped and the
# all-zero predictions would flow silently into the submission.
if not model_path_list:
    raise FileNotFoundError('model_path_list is empty: no model weights to predict with')

# Running averages of the start / end scores over all models.
test_start_preds = np.zeros((NUM_TEST_IMAGES, MAX_LEN))
test_end_preds = np.zeros((NUM_TEST_IMAGES, MAX_LEN))

n_models = len(model_path_list)
for model_path in model_path_list:
    print(model_path)
    # Drop the Keras global state left by the previous model so that memory
    # does not grow with every ensemble member built in this loop.
    tf.keras.backend.clear_session()
    model = model_fn()
    model.load_weights(model_path)

    # model.predict returns [start_scores, end_scores]; each model contributes
    # an equal share to the averaged prediction.
    test_preds = model.predict(x_test)
    test_start_preds += test_preds[0] / n_models
    test_end_preds += test_preds[1] / n_models

/kaggle/input/6-tweet-train-distilbert-lower-bce-v2/model.h5


# Postprocessing

In [9]:
# The highest-scoring position of each head is taken as the span boundary.
test['start'] = test_start_preds.argmax(axis=-1)
test['end'] = test_end_preds.argmax(axis=-1)

# Decode row by row from the three needed columns. Iterating the columns
# directly avoids DataFrame.apply(axis=1), which boxes every row into a Series
# and special-cases empty frames.
test['selected_text'] = [
    decode(start, end, text, tokenizer)
    for start, end, text in zip(test['start'], test['end'], test['text'])
]

# Test set predictions

In [10]:
# Fill the sample submission with the decoded spans and write it out.
# The values are aligned on the row index, not on textID.
submission = (
    pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')
    .assign(selected_text=test["selected_text"])
)
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,textID,selected_text
0,11aa4945ff,wish
1,fd1db57dc0,i'm done.haha.
2,2524332d66,i'm concerned
3,0fb19285b2,hey guys it's working no need to worry.
4,e6c9e5e3ab,26th february
5,311d2b185b,breaky
6,95dfefd4e7,2 party 4
7,739f17cfe1,oh yeah the camera clipping problems with void...
8,c6322a85c2,_layne hmm.. what's ur fav movie?? tv shows??
9,b4401d6b4d,"salt and vinegar, cheese and onion make your b..."
