# Dependencies

In [1]:
import json, glob
from tweet_utility_scripts import *
from tweet_utility_preprocess_roberta_scripts import *
from transformers import TFRobertaModel, RobertaConfig
from tokenizers import ByteLevelBPETokenizer
from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Load data

In [2]:
# Read the competition test set and show a quick summary of it.
test_csv_path = '/kaggle/input/tweet-sentiment-extraction/test.csv'
test = pd.read_csv(test_csv_path)

n_test_samples = len(test)
print('Test samples: %s' % n_test_samples)
display(test.head())

Test samples: 3534


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Model parameters

In [3]:
# Dataset directory that holds the artifacts read by this notebook
# (config.json here; vocab/merges files and *.h5 weights further down).
input_base_path = '/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/'
config_file_path = input_base_path + 'config.json'

# Parse the stored JSON configuration into a plain dict.
with open(config_file_path) as config_fp:
    config = json.load(config_fp)

config

{'MAX_LEN': 96,
 'BATCH_SIZE': 16,
 'EPOCHS': 5,
 'LEARNING_RATE': 5e-05,
 'ES_PATIENCE': 1,
 'question_size': 4,
 'N_FOLDS': 3,
 'base_model_path': '/kaggle/input/qa-transformers/roberta/roberta-base-tf_model.h5',
 'config_path': '/kaggle/input/qa-transformers/roberta/roberta-base-config.json'}

In [4]:
base_path = '/kaggle/input/qa-transformers/roberta/'

# Tokenizer files stored next to the fold weights.
vocab_path = input_base_path + 'vocab.json'
merges_path = input_base_path + 'merges.txt'

# Collect every *.h5 weight file; sorting keeps the processing order stable.
model_path_list = sorted(glob.glob(input_base_path + '*.h5'))

print('Models to predict:')
print("\n".join(model_path_list))

Models to predict:
/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_1.h5
/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_2.h5
/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_3.h5


# Tokenizer

In [5]:
# Byte-level BPE tokenizer built from the vocab/merges files located above.
tokenizer = ByteLevelBPETokenizer(
    vocab_file=vocab_path,
    merges_file=merges_path,
    lowercase=True,
    add_prefix_space=True,
)

# Pre-process

In [6]:
# Clean the tweet text before tokenisation: missing values become empty
# strings, then every tweet is lower-cased and stripped of surrounding
# whitespace.
#
# The cleaned column is assigned back explicitly. Calling
# `fillna(..., inplace=True)` on the `test['text']` selection is chained
# assignment: it is deprecated in recent pandas and does not update the
# DataFrame when Copy-on-Write is enabled, which would leave NaN values
# that break the `.lower()` call below.
test['text'] = test['text'].fillna('')
test['text'] = test['text'].apply(lambda x: x.lower().strip())

# Build the model inputs from the cleaned tweets.
x_test = get_data_test(test, tokenizer, config['MAX_LEN'], preprocess_fn=preprocess_roberta_test)

# Model

In [7]:
# Build the RoBERTa configuration from the file referenced by config['config_path'];
# output_hidden_states is explicitly turned off. model_fn below reads this global.
module_config = RobertaConfig.from_pretrained(config['config_path'], output_hidden_states=False)

def model_fn(MAX_LEN):
    """Build the start/end span prediction model on top of a RoBERTa encoder.

    The encoder is loaded from ``config['base_model_path']`` using the global
    ``module_config``. Two parallel heads (1x1 convolution -> flatten ->
    softmax dense layer of width ``MAX_LEN``) are attached to the first
    element of the encoder output, one for the start position and one for the
    end position.

    Args:
        MAX_LEN: Length of the (padded) input sequences; also the size of
            each softmax output.

    Returns:
        A Keras ``Model`` taking ``[input_ids, attention_mask]`` and returning
        ``[y_start, y_end]``.
    """
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    # NOTE: a third `token_type_ids` input used to be created here but was never
    # fed to the encoder nor listed in the model inputs, so it has been dropped.

    base_model = TFRobertaModel.from_pretrained(config['base_model_path'], config=module_config, name="base_model")
    encoder_outputs = base_model({'input_ids': input_ids, 'attention_mask': attention_mask})
    last_state = encoder_outputs[0]

    # Start-position head. The heads are created in the same order as before
    # (start first, then end) so layer ordering inside the model is unchanged.
    x_start = layers.Conv1D(1, 1)(last_state)
    x_start = layers.Flatten()(x_start)
    y_start = layers.Dense(MAX_LEN, activation='softmax', name='y_start')(x_start)

    # End-position head.
    x_end = layers.Conv1D(1, 1)(last_state)
    x_end = layers.Flatten()(x_end)
    y_end = layers.Dense(MAX_LEN, activation='softmax', name='y_end')(x_end)

    model = Model(inputs=[input_ids, attention_mask], outputs=[y_start, y_end])

    return model

# Make predictions

In [8]:
# Accumulators for the fold-averaged start/end outputs, one row per test sample.
NUM_TEST_IMAGES = len(test)
n_models = len(model_path_list)
pred_shape = (NUM_TEST_IMAGES, config['MAX_LEN'])
test_start_preds = np.zeros(pred_shape)
test_end_preds = np.zeros(pred_shape)

for model_path in model_path_list:
    print(model_path)

    # Rebuild the architecture, then load this fold's weights into it.
    model = model_fn(config['MAX_LEN'])
    model.load_weights(model_path)

    # Each fold contributes an equal share to the running average.
    test_preds = model.predict(x_test)
    fold_start, fold_end = test_preds[0], test_preds[1]
    test_start_preds += fold_start / n_models
    test_end_preds += fold_end / n_models

/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_1.h5
/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_2.h5
/kaggle/input/51-tweet-train-3fold-roberta-base-dropout-np/model_fold_3.h5


# Post-process

In [9]:
# Most likely start/end index for each sample, taken from the averaged outputs.
test['start'] = test_start_preds.argmax(axis=-1)
test['end'] = test_end_preds.argmax(axis=-1)

# Simple length statistics of the (already cleaned) text.
test['text_len'] = test['text'].apply(len)
test['text_wordCnt'] = test['text'].apply(lambda x: len(x.split(' ')))

# Keep the indices consistent: 0 <= start <= end <= text_len.
# The clipped columns are assigned back instead of using `inplace=True` on a
# column selection, which is deprecated chained assignment and leaves the
# DataFrame untouched under pandas Copy-on-Write.
test['end'] = test['end'].clip(0, test['text_len'])
test['start'] = test['start'].clip(0, test['end'])

# Turn the predicted indices back into text; missing results become ''.
test['selected_text'] = test.apply(lambda x: decode(x['start'], x['end'], x['text'], config['question_size'], tokenizer), axis=1)
test['selected_text'] = test['selected_text'].fillna('')

# Visualize predictions

In [10]:
# Show the first ten rows, including the predicted start/end indices and the decoded selected_text.
display(test.head(10))

Unnamed: 0,textID,text,sentiment,start,end,text_len,text_wordCnt,selected_text
0,f87dea47db,last session of the day http://twitpic.com/67ezh,neutral,4,8,49,7,last session of the day
1,96d74cb729,shanghai is also really exciting (precisely --...,positive,10,10,102,17,exciting
2,eee518ae67,"recession hit veronique branquinho, she has to...",negative,21,23,78,13,a shame!
3,01082688c6,happy bday!,positive,4,7,11,2,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - i like it!!,positive,18,20,38,5,like it!!
5,726e501993,that`s great!! weee!! visitors!,positive,4,8,31,4,that`s great!!
6,261932614e,i think everyone hates me on here lol,negative,7,7,39,10,hates
7,afa11da83f,"soooooo wish i could, but im in school and mys...",negative,20,20,72,13,blocked
8,e64208b4ef,and within a short time of the last clue all o...,neutral,4,15,52,12,and within a short time of the last clue all o...
9,37bcad24ca,what did you get? my day is alright.. haven`t...,neutral,4,29,103,19,what did you get? my day is alright.. haven`t...


# Test set predictions

In [11]:
# Start from the sample submission and overwrite its `selected_text` column
# with the predictions (pandas aligns the two on the row index).
sample_submission_path = '/kaggle/input/tweet-sentiment-extraction/sample_submission.csv'
submission = pd.read_csv(sample_submission_path).assign(selected_text=test["selected_text"])

submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,a shame!
3,01082688c6,happy bday!
4,33987a8ee5,like it!!
5,726e501993,that`s great!!
6,261932614e,hates
7,afa11da83f,blocked
8,e64208b4ef,and within a short time of the last clue all o...
9,37bcad24ca,what did you get? my day is alright.. haven`t...
