# Dependencies

In [1]:
from tweet_utility_scripts import *
from transformers import TFDistilBertModel, DistilBertConfig
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate, Subtract

# Load data

In [2]:
# Read the competition test set into a DataFrame.
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

# Report the row count, then preview the first rows.
n_test_rows = test.shape[0]
print('Test samples: %s' % n_test_rows)
display(test.head())

Test samples: 3534


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Model parameters

In [3]:
# Width of the model inputs and of each prediction head.
MAX_LEN = 128
# Forwarded to `decode` during post-processing.
question_size = 3

# Pre-trained DistilBERT (SQuAD-distilled) weights, config and vocabulary.
# NOTE(review): the vocab file is named after bert-large-uncased while the
# model is distilbert-base-uncased -- presumably the same WordPiece vocabulary;
# confirm.
base_path = '/kaggle/input/qa-transformers/distilbert/'
base_model_path = f'{base_path}distilbert-base-uncased-distilled-squad-tf_model.h5'
config_path = f'{base_path}distilbert-base-uncased-distilled-squad-config.json'
tokenizer_path = f'{base_path}bert-large-uncased-vocab.txt'

# Every .h5 checkpoint in the training-output dataset, in sorted order.
input_base_path = '/kaggle/input/19-tweet-train-distilbert-base-uncased-sub-bce/'
model_path_list = sorted(glob.glob(f'{input_base_path}*.h5'))
print('Models to predict:')
print("\n".join(model_path_list))

Models to predict:
/kaggle/input/19-tweet-train-distilbert-base-uncased-sub-bce/model.h5


# Tokenizer

In [4]:
# WordPiece tokenizer built from the vocabulary file, configured to lower-case
# its input.
tokenizer = BertWordPieceTokenizer(
    tokenizer_path,
    lowercase=True,
)

# Pre-process

In [5]:
# Replace missing tweets with empty strings. The result is assigned back
# rather than calling `fillna(..., inplace=True)` on `test['text']`: under
# pandas Copy-on-Write that in-place call only modifies a temporary Series and
# leaves `test` unchanged, so NaN values would survive and break `.lower()`.
test['text'] = test['text'].fillna('')
# Lower-case the text with the vectorized string accessor.
test["text"] = test["text"].str.lower()

# Build the inputs for the test set; `x_test` is later passed to
# `model.predict`. `get_data_test` is presumably provided by
# `tweet_utility_scripts` (star import) -- confirm its exact output format there.
x_test = get_data_test(test, tokenizer, MAX_LEN)

# Model

In [6]:
# DistilBERT configuration shared by every model instance; hidden states of
# intermediate layers are not requested.
module_config = DistilBertConfig.from_pretrained(
    config_path,
    output_hidden_states=False,
)


def model_fn():
    """Build the span-prediction model used for inference.

    Architecture: DistilBERT encoder -> global average pooling over the
    sequence axis -> two sigmoid Dense heads of width ``MAX_LEN``. The two
    outputs are the element-wise differences of the heads:
    ``y_start = start - end`` and ``y_end = end - start``.

    Layers are created in a fixed order. Weights are later loaded from an
    .h5 checkpoint, so this presumably has to mirror the training notebook's
    architecture -- verify before reordering anything.

    Returns:
        A Keras ``Model`` taking ``[input_ids, attention_mask, token_type_ids]``
        and returning ``[y_start, y_end]``.
    """
    ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    mask_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    type_ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='token_type_ids')

    encoder = TFDistilBertModel.from_pretrained(base_model_path, config=module_config, name="base_model")
    # NOTE(review): `token_type_ids` is passed to the encoder in the input
    # dict; whether DistilBERT actually consumes it depends on the installed
    # transformers version -- confirm.
    encoder_inputs = {
        'input_ids': ids_in,
        'attention_mask': mask_in,
        'token_type_ids': type_ids_in,
    }
    # First element of the encoder output is used as the per-token hidden state.
    hidden_states = encoder(encoder_inputs)[0]

    pooled = GlobalAveragePooling1D()(hidden_states)

    start_scores = Dense(MAX_LEN, activation='sigmoid')(pooled)
    end_scores = Dense(MAX_LEN, activation='sigmoid')(pooled)

    y_start = Subtract(name='y_start')([start_scores, end_scores])
    y_end = Subtract(name='y_end')([end_scores, start_scores])

    return Model(inputs=[ids_in, mask_in, type_ids_in], outputs=[y_start, y_end])

# Make predictions

In [7]:
# Number of test rows (the name is historical; the rows are tweets).
NUM_TEST_IMAGES = len(test)
pred_shape = (NUM_TEST_IMAGES, MAX_LEN)
test_start_preds = np.zeros(pred_shape)
test_end_preds = np.zeros(pred_shape)

# Average the predictions of every checkpoint: each model contributes
# 1 / n_models of its output to the running totals.
n_models = len(model_path_list)
for model_path in model_path_list:
    print(model_path)
    model = model_fn()
    model.load_weights(model_path)

    test_preds = model.predict(x_test)
    start_scores, end_scores = test_preds[0], test_preds[1]
    test_start_preds += start_scores / n_models
    test_end_preds += end_scores / n_models

/kaggle/input/19-tweet-train-distilbert-base-uncased-sub-bce/model.h5


# Post process

In [8]:
# Highest-scoring start/end position for every row (argmax over the last axis
# of the averaged predictions).
test['start'] = test_start_preds.argmax(axis=-1)
test['end'] = test_end_preds.argmax(axis=-1)

test['text_len'] = test['text'].str.len()
# Enforce 0 <= start <= end <= text_len. Results are assigned back instead of
# using `clip(..., inplace=True)` on a column selection: under pandas
# Copy-on-Write the in-place form only touches a temporary Series, so the
# bounds would silently not be applied.
# NOTE(review): `end` comes from an argmax over MAX_LEN positions while
# `text_len` is a character count -- confirm this upper bound is the intended
# one for what `decode` expects.
test["end"] = test["end"].clip(0, test["text_len"])
test["start"] = test["start"].clip(0, test["end"])

# Turn each (start, end) pair back into text with the project's `decode`
# helper, then replace any missing result with an empty string (assigned back
# for the same Copy-on-Write reason as above).
test['selected_text'] = test.apply(
    lambda row: decode(row['start'], row['end'], row['text'], question_size, tokenizer),
    axis=1,
)
test["selected_text"] = test["selected_text"].fillna('')

# Visualize predictions

In [9]:
# Preview the first ten test rows, including the `start`, `end`, `text_len`
# and `selected_text` columns added during post-processing.
display(test.head(10))

Unnamed: 0,textID,text,sentiment,start,end,text_len,selected_text
0,f87dea47db,last session of the day http://twitpic.com/67ezh,neutral,4,12,49,last session of the day http://
1,96d74cb729,shanghai is also really exciting (precisely -...,positive,4,12,103,shanghai is also really exciting (precisely --
2,eee518ae67,"recession hit veronique branquinho, she has to...",negative,4,12,78,"recession hit veronique branquinho,"
3,01082688c6,happy bday!,positive,4,12,12,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - i like it!!,positive,4,12,39,http://twitpic.
5,726e501993,that`s great!! weee!! visitors!,positive,4,12,32,that`s great!! weee!
6,261932614e,i think everyone hates me on here lol,negative,4,12,39,i think everyone hates me on here lol
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative,4,12,73,"soooooo wish i could, but im"
8,e64208b4ef,and within a short time of the last clue all ...,neutral,4,12,53,and within a short time of the last clue
9,37bcad24ca,what did you get? my day is alright.. haven`...,neutral,4,12,104,what did you get? my day is alright


# Test set predictions

In [10]:
# Load the sample-submission template and overwrite its `selected_text`
# column with the predictions (aligned on the DataFrame index).
submission = pd.read_csv(
    '/kaggle/input/tweet-sentiment-extraction/sample_submission.csv'
).assign(selected_text=test["selected_text"])

# Write the submission file, then show the first rows as the cell output.
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day http://
1,96d74cb729,shanghai is also really exciting (precisely --
2,eee518ae67,"recession hit veronique branquinho,"
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.
5,726e501993,that`s great!! weee!
6,261932614e,i think everyone hates me on here lol
7,afa11da83f,"soooooo wish i could, but im"
8,e64208b4ef,and within a short time of the last clue
9,37bcad24ca,what did you get? my day is alright
