## Dependencies

In [1]:
from tweet_utility_scripts import *
from transformers import TFDistilBertModel
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate

# Load data

In [2]:
# Read the competition test split into a DataFrame.
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

# Report how many rows were loaded, then preview the first few.
print('Test samples: %s' % test.shape[0])
display(test.head(5))

Test samples: 3535


Unnamed: 0,textID,text,sentiment
0,11aa4945ff,http://twitpic.com/67swx - i wish i was calli...,positive
1,fd1db57dc0,i'm done.haha. HOUSE MD marathon ulet,positive
2,2524332d66,I'm concerned for that family,positive
3,0fb19285b2,HEY GUYS IT'S WORKING NO NEED TO WORRY. i have...,positive
4,e6c9e5e3ab,26th February,neutral


# Model parameters

In [3]:
# Sequence length: used as the width of every model input and of both output heads.
MAX_LEN = 128

# Pretrained DistilBERT checkpoint and its config.
base_path = '/kaggle/input/qa-transformers/distilbert/'
config_path = base_path + 'distilbert-base-uncased-distilled-squad-config.json'
base_model_path = base_path + 'distilbert-base-uncased-distilled-squad-tf_model.h5'

# Vocabulary file for the tokenizer and the directory holding the trained weights.
tokenizer_path = base_path + 'bert-large-uncased-vocab.txt'
input_base_path = '/kaggle/input/8-tweet-train-distilbert-lower-avg-max/'

# Every *.h5 file in the weights directory, in sorted order.
model_path_list = sorted(glob.glob(input_base_path + '*.h5'))

print('Models to predict:')
print('\n'.join(model_path_list))

Models to predict:
/kaggle/input/8-tweet-train-distilbert-lower-avg-max/model.h5


# Tokenizer

In [4]:
# Build the WordPiece tokenizer from the configured vocabulary file,
# with lowercasing enabled.
tokenizer = BertWordPieceTokenizer(
    tokenizer_path,
    lowercase=True,
)

# Preprocess

In [5]:
# Replace missing tweets with an empty string, then lowercase the text.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form operates
# on an intermediate object and is not guaranteed to update `test` itself
# (pandas warns about it, and it has no effect under copy-on-write).
test['text'] = test['text'].fillna('').str.lower()

# Turn the cleaned tweets into model inputs using the shared helper.
x_test = get_data_test(test, tokenizer, MAX_LEN)

# Model

In [6]:
def model_fn():
    """Build the DistilBERT-based network with a start head and an end head.

    Reads the module-level ``MAX_LEN``, ``base_model_path`` and ``config_path``.

    Returns:
        A Keras ``Model`` whose inputs are ``[input_ids, attention_mask,
        token_type_ids]`` (each an int32 input of length ``MAX_LEN``) and whose
        outputs are ``[y_start, y_end]``, two softmax layers of size ``MAX_LEN``.
    """
    ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    mask_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    type_ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name='token_type_ids')

    encoder = TFDistilBertModel.from_pretrained(base_model_path, config=config_path, name="base_model")
    encoder_inputs = {
        'input_ids': ids_in,
        'attention_mask': mask_in,
        'token_type_ids': type_ids_in,
    }
    # Only the first element of the encoder output is used downstream.
    hidden = encoder(encoder_inputs)[0]

    # Pool the encoder output two ways and join the results into one feature vector.
    pooled_avg = GlobalAveragePooling1D()(hidden)
    pooled_max = GlobalMaxPooling1D()(hidden)
    features = Concatenate()([pooled_avg, pooled_max])

    # Two independent softmax heads over MAX_LEN positions.
    start_head = Dense(MAX_LEN, activation='softmax', name='y_start')(features)
    end_head = Dense(MAX_LEN, activation='softmax', name='y_end')(features)

    return Model(
        inputs=[ids_in, mask_in, type_ids_in],
        outputs=[start_head, end_head],
    )

# Make predictions

In [7]:
# Despite the name, this is simply the number of rows in `test`.
NUM_TEST_IMAGES = len(test)

# Fail fast when no weights were found: otherwise the accumulators below stay
# all-zero and every row would silently get start = end = 0.
n_models = len(model_path_list)
if n_models == 0:
    raise FileNotFoundError('No model weights (*.h5) found in %s' % input_base_path)

# Accumulators for the mean prediction over all models.
test_start_preds = np.zeros((NUM_TEST_IMAGES, MAX_LEN))
test_end_preds = np.zeros((NUM_TEST_IMAGES, MAX_LEN))

for model_path in model_path_list:
    print(model_path)
    # Drop the Keras global state left by the previously built model so memory
    # does not grow with the number of models in the ensemble.
    tf.keras.backend.clear_session()
    model = model_fn()
    model.load_weights(model_path)

    # predict returns [start, end]; each model contributes an equal share.
    test_preds = model.predict(x_test)
    test_start_preds += test_preds[0] / n_models
    test_end_preds += test_preds[1] / n_models

/kaggle/input/8-tweet-train-distilbert-lower-avg-max/model.h5


# Postprocess

In [8]:
# Index of the highest averaged score along the last axis, for start and end.
test['start'] = np.argmax(test_start_preds, axis=-1)
test['end'] = np.argmax(test_end_preds, axis=-1)

# Pass each row's start, end and text to the shared `decode` helper.
test['selected_text'] = test.apply(
    lambda row: decode(row['start'], row['end'], row['text'], tokenizer),
    axis=1,
)

# Test set predictions

In [9]:
# The sample submission supplies the expected textID rows and their order.
submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

# Every requested textID must have a prediction; otherwise the file would
# contain blank answers.
unmatched = ~submission['textID'].isin(test['textID'])
if unmatched.any():
    raise ValueError('%d textID(s) in sample_submission.csv have no prediction' % unmatched.sum())

# Attach predictions by textID rather than by row position, so the output stays
# correct even if the sample file and `test` are ordered differently.
# A left merge keeps the sample file's row order; validate rejects duplicate IDs.
submission = submission.drop(columns='selected_text', errors='ignore').merge(
    test[['textID', 'selected_text']],
    on='textID',
    how='left',
    validate='one_to_one',
)

submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,textID,selected_text
0,11aa4945ff,http://twitpic.com/67swx
1,fd1db57dc0,i'm done.haha. house md marathon ulet
2,2524332d66,i'm concerned for
3,0fb19285b2,hey guys it's working no need to worry. i have...
4,e6c9e5e3ab,26th february
5,311d2b185b,breaks my achy breaky heart they split ways in
6,95dfefd4e7,well off 2 bed...cant wait 2 party 4 mother
7,739f17cfe1,oh yeah the camera clipping problems with void...
8,c6322a85c2,_layne hmm.. what's ur fav movie??
9,b4401d6b4d,"salt and vinegar, cheese and"
