## Dependencies

In [1]:
import json, glob
from tweet_utility_scripts import *
from tweet_utility_preprocess_roberta_scripts_aux import *
from transformers import TFRobertaModel, RobertaConfig
from tokenizers import ByteLevelBPETokenizer
from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Load data

In [2]:
# Load the competition test split from the Kaggle input directory.
test_csv_path = '/kaggle/input/tweet-sentiment-extraction/test.csv'
test = pd.read_csv(test_csv_path)

# Report the number of loaded rows and preview the first few.
n_test_rows = len(test)
print('Test samples: %s' % n_test_rows)
display(test.head())

Test samples: 3534


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Model parameters

In [3]:
# Directory of the training-run dataset; config.json is read from it here, and
# the tokenizer files and model checkpoints are resolved from it further down.
input_base_path = '/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/'

# Parse the stored settings into a plain dict.
config_file_path = input_base_path + 'config.json'
with open(config_file_path) as config_handle:
    config = json.load(config_handle)

# Echo the parsed settings as the cell output.
config

{'MAX_LEN': 64,
 'BATCH_SIZE': 32,
 'EPOCHS': 7,
 'LEARNING_RATE': 3e-05,
 'ES_PATIENCE': 2,
 'N_FOLDS': 5,
 'question_size': 4,
 'base_model_path': '/kaggle/input/qa-transformers/roberta/roberta-base-tf_model.h5',
 'config_path': '/kaggle/input/qa-transformers/roberta/roberta-base-config.json'}

In [4]:
# Tokenizer files are read from the training-run dataset (input_base_path);
# the pretrained RoBERTa weights and config are read from base_path.
vocab_path = input_base_path + 'vocab.json'
merges_path = input_base_path + 'merges.txt'
base_path = '/kaggle/input/qa-transformers/roberta/'

# Point the loaded config at the base-model files under base_path.
config['base_model_path'] = base_path + 'roberta-base-tf_model.h5'
config['config_path'] = base_path + 'roberta-base-config.json'

# Collect every 'model*.h5' checkpoint; sorted so the processing order is deterministic.
model_path_list = sorted(glob.glob(input_base_path + 'model' + '*.h5'))

# Fail fast: with no checkpoints the prediction loop would never execute and
# the zero-initialised prediction arrays would silently become the submission.
if not model_path_list:
    raise FileNotFoundError('No model checkpoints (model*.h5) found in %s' % input_base_path)

print('Models to predict:')
print(*model_path_list, sep = '\n')

Models to predict:
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_1.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_2.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_3.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_4.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_5.h5


# Tokenizer

In [5]:
# Byte-level BPE tokenizer built from the vocab/merges files, with the
# lowercase and add_prefix_space options switched on.
tokenizer = ByteLevelBPETokenizer(
    vocab_file=vocab_path,
    merges_file=merges_path,
    lowercase=True,
    add_prefix_space=True,
)

# Pre-process

In [6]:
# Normalise the tweet text: missing values become '', then lowercase and trim
# surrounding whitespace.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# in-place mutation, which pandas deprecates and which leaves the frame
# unchanged once Copy-on-Write is enabled.
test['text'] = test['text'].fillna('').str.lower().str.strip()

# Encode the test set for the model. get_data_test / preprocess_roberta_test
# are not defined in this notebook; they come from the star-imported utility scripts.
x_test, x_test_aux, x_test_aux_2 = get_data_test(test, tokenizer, config['MAX_LEN'], preprocess_fn=preprocess_roberta_test)

# Model

In [7]:
# RoBERTa configuration loaded from the configured path, with
# output_hidden_states switched off.
module_config = RobertaConfig.from_pretrained(config['config_path'], output_hidden_states=False)

def model_fn(MAX_LEN):
    """Build the span-extraction model: RoBERTa encoder followed by a 2-unit dense head.

    Reads the module-level `config` (for the pretrained weights path) and
    `module_config` (for the encoder configuration).

    Args:
        MAX_LEN: Fixed length of the `input_ids` and `attention_mask` inputs.

    Returns:
        A Keras `Model` with inputs `[input_ids, attention_mask]` and outputs
        `[start_logits, end_logits]`; each output is one half of the dense
        head's 2-channel result with the trailing unit axis squeezed away.
    """
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

    base_model = TFRobertaModel.from_pretrained(config['base_model_path'], config=module_config, name="base_model")
    # Take the first encoder output by position rather than unpacking exactly
    # two values, so the call does not break when the encoder returns a
    # different number of outputs (or an indexable output object).
    last_hidden_state = base_model({'input_ids': input_ids, 'attention_mask': attention_mask})[0]

    logits = layers.Dense(2, name="qa_outputs")(last_hidden_state)

    # Split the two channels into separate start / end logits and drop the
    # now size-1 last axis of each.
    start_logits, end_logits = tf.split(logits, 2, axis=-1)
    start_logits = tf.squeeze(start_logits, axis=-1)
    end_logits = tf.squeeze(end_logits, axis=-1)

    model = Model(inputs=[input_ids, attention_mask], outputs=[start_logits, end_logits])

    return model

# Make predictions

In [8]:
# Number of test rows (the name is historical; kept so existing references still work).
NUM_TEST_IMAGES = len(test)
# Accumulators for the start / end logits summed over all checkpoints.
test_start_preds = np.zeros((NUM_TEST_IMAGES, config['MAX_LEN']))
test_end_preds = np.zeros((NUM_TEST_IMAGES, config['MAX_LEN']))

for model_path in model_path_list:
    print(model_path)
    # A fresh model is built for every checkpoint; clear the Keras global
    # state first so the graphs of earlier iterations do not pile up in memory.
    tf.keras.backend.clear_session()
    model = model_fn(config['MAX_LEN'])
    model.load_weights(model_path)

    # model.predict returns [start_logits, end_logits]; add each to its accumulator.
    test_preds = model.predict(get_test_dataset(x_test, config['BATCH_SIZE']))
    test_start_preds += test_preds[0]
    test_end_preds += test_preds[1]

/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_1.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_2.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_3.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_4.h5
/kaggle/input/229-tweet-train-5fold-roberta-reference-hf-exp2/model_fold_5.h5


# Post-process

In [9]:
# Highest-scoring start / end position per row, taken from the summed logits.
test['start'] = test_start_preds.argmax(axis=-1)
test['end'] = test_end_preds.argmax(axis=-1)

# Turn the (start, end) positions back into text with the imported decode helper.
test['selected_text'] = test.apply(lambda x: decode(x['start'], x['end'], x['text'], config['question_size'], tokenizer), axis=1)

# Post-process

def _keep_words_from_text(row):
    """Return the predicted words that also occur as whitespace-split words of the tweet, joined by single spaces."""
    # Split the tweet once per row (not once per predicted word) and use a set
    # for the membership test.
    text_words = set(row['text'].split())
    return ' '.join(word for word in row['selected_text'].split() if word in text_words)

test["selected_text"] = test.apply(_keep_words_from_text, axis=1)
# Fall back to the full tweet when no predicted word survives the filter.
test['selected_text'] = test.apply(lambda x: x['text'] if (x['selected_text'] == '') else x['selected_text'], axis=1)
# Assign back instead of fillna(..., inplace=True) on the column selection,
# which is deprecated chained in-place mutation and does not update the frame
# under pandas Copy-on-Write.
test['selected_text'] = test['selected_text'].fillna(test['text'])

# Visualize predictions

In [10]:
def _space_split_count(value):
    """Number of pieces obtained by splitting `value` on a single space character."""
    return len(value.split(' '))


def _token_count(value):
    """Number of token ids the tokenizer produces for `value`."""
    return len(tokenizer.encode(value).ids)


# Character, word and token counts for the tweet ('text_*') and for the
# predicted span ('label_*'), added in the same column order as before.
test['text_len'] = test['text'].apply(len)
test['label_len'] = test['selected_text'].apply(len)
test['text_wordCnt'] = test['text'].apply(_space_split_count)
test['label_wordCnt'] = test['selected_text'].apply(_space_split_count)
test['text_tokenCnt'] = test['text'].apply(_token_count)
test['label_tokenCnt'] = test['selected_text'].apply(_token_count)

# Jaccard score between the full tweet and the predicted span, row by row.
test['jaccard'] = test.apply(lambda row: jaccard(row['text'], row['selected_text']), axis=1)

display(test.head(10))
display(test.describe())

Unnamed: 0,textID,text,sentiment,start,end,selected_text,text_len,label_len,text_wordCnt,label_wordCnt,text_tokenCnt,label_tokenCnt,jaccard
0,f87dea47db,last session of the day http://twitpic.com/67ezh,neutral,4,8,last session of the day,49,23,7,5,17,5,0.833333
1,96d74cb729,shanghai is also really exciting (precisely --...,positive,10,10,exciting,102,8,17,1,33,1,0.066667
2,eee518ae67,"recession hit veronique branquinho, she has to...",negative,20,23,such a shame!,78,13,13,3,20,4,0.230769
3,01082688c6,happy bday!,positive,4,7,happy bday!,11,11,2,2,4,4,1.0
4,33987a8ee5,http://twitpic.com/4w75p - i like it!!,positive,17,20,i like it!!,38,11,5,3,17,4,0.6
5,726e501993,that`s great!! weee!! visitors!,positive,4,8,that`s great!!,31,14,4,2,10,5,0.5
6,261932614e,i think everyone hates me on here lol,negative,7,7,hates,39,5,10,1,10,1,0.125
7,afa11da83f,"soooooo wish i could, but im in school and mys...",negative,20,20,blocked,72,7,13,1,17,1,0.076923
8,e64208b4ef,and within a short time of the last clue all o...,neutral,4,15,and within a short time of the last clue all o...,52,52,12,12,12,12,1.0
9,37bcad24ca,what did you get? my day is alright.. haven`t...,neutral,4,29,what did you get? my day is alright.. haven`t ...,103,102,19,18,26,25,1.0


Unnamed: 0,start,end,text_len,label_len,text_wordCnt,label_wordCnt,text_tokenCnt,label_tokenCnt,jaccard
count,3534.0,3534.0,3534.0,3534.0,3534.0,3534.0,3534.0,3534.0,3534.0
mean,8.334748,16.798529,67.326259,35.600736,13.185059,6.863328,18.19751,9.437748,0.57826
std,7.077369,9.896443,35.609555,36.611955,7.113988,7.112038,9.808701,9.883525,0.413383
min,4.0,4.0,3.0,2.0,1.0,1.0,1.0,1.0,0.033333
25%,4.0,9.0,38.0,7.0,7.0,1.0,10.0,2.0,0.142857
50%,4.0,15.0,62.0,19.0,12.0,4.0,17.0,5.0,0.585784
75%,11.0,23.0,96.0,53.0,19.0,10.0,26.0,14.0,1.0
max,44.0,57.0,142.0,137.0,35.0,32.0,68.0,68.0,1.0


# Test set predictions

In [11]:
submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

# The column assignment below aligns on the row index, so make sure both frames
# list the same textIDs in the same order; otherwise predictions would be
# attached to the wrong tweets (or become NaN) without any error.
if not submission['textID'].equals(test['textID']):
    raise ValueError('sample_submission.csv and the test set do not list the same textIDs in the same order')

submission['selected_text'] = test['selected_text']
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
5,726e501993,that`s great!!
6,261932614e,hates
7,afa11da83f,blocked
8,e64208b4ef,and within a short time of the last clue all o...
9,37bcad24ca,what did you get? my day is alright.. haven`t ...
