#  Twitter Airline RoBERTa
This notebook referred to the following noteboook.<br/>
https://www.kaggle.com/code/junjitakeshima/ell-simple-roberta-starter-eng

# 1. Read Data

In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
from transformers import RobertaTokenizer, TFRobertaModel
transformers.logging.set_verbosity_error()
import re
pd.set_option("display.max_columns", None)

2022-10-22 09:26:43.796441: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-22 09:26:43.797525: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-22 09:26:43.798216: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-22 09:26:43.800337: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [2]:
data0 = pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv')
display(data0[0:3].T)

Unnamed: 0,0,1,2
tweet_id,570306133677760513,570301130888122368,570301083672813571
airline_sentiment,neutral,positive,neutral
airline_sentiment_confidence,1.0,0.3486,0.6837
negativereason,,,
negativereason_confidence,,0.0,
airline,Virgin America,Virgin America,Virgin America
airline_sentiment_gold,,,
name,cairdin,jnardino,yvonnalynn
negativereason_gold,,,
retweet_count,0,0,0


In [3]:
data=data0[['airline_sentiment','text']]
data=data.dropna()
data['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [4]:
Name0=data['airline_sentiment'].unique().tolist()
Name=sorted(Name0)
N=list(range(len(Name)))
normal_mapping=dict(zip(Name,N)) 
reverse_mapping=dict(zip(N,Name)) 
print(normal_mapping)
data['airline_sentiment']=data['airline_sentiment'].map(normal_mapping)

{'negative': 0, 'neutral': 1, 'positive': 2}


In [5]:
n=len(data)
N=list(range(n))
random.shuffle(N)

In [6]:
train_df = data.iloc[N[0:(n//10)*3]].reset_index(drop=True)
test_df = data.iloc[N[(n//10)*3:(n//10)*4]].reset_index(drop=True)

In [7]:
train_df["text"] = train_df["text"].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)
test_df["text"] = test_df["text"].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)

# 2. Tokenize and create data

In [8]:
tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/")
max_len = 128

In [9]:
def create_data(text):
    
    encoded = tokenizer.batch_encode_plus(
        text,
        add_special_tokens = True,
        max_length= max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True)

    input_ids       = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")

    return {"input_ids": input_ids, "attention_masks": attention_masks}

In [10]:
train_data   = create_data(train_df['text'])

In [11]:
train = []
train.append(train_df["airline_sentiment"].to_list())


# 3. Build Model

In [12]:
def build_model():
    
    model_ids  = Input(shape=(max_len, ), dtype = tf.int32)
    model_mask = Input(shape=(max_len, ), dtype = tf.int32)
    
    roberta_model = TFRobertaModel.from_pretrained("../input/roberta-base/")
    
    x = roberta_model(input_ids = model_ids, 
                      attention_mask = model_mask)       
    x = tf.keras.layers.GlobalAveragePooling1D()(x.last_hidden_state)    
    outputs = Dense(len(Name))(x) ####
    
    model = tf.keras.Model(inputs = [model_ids, model_mask], outputs = outputs)
    
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = "mse",
        metrics=["mse"])
    return model

In [13]:
def scheduler(epoch):
    learning_rate = 2e-5
    if epoch == 0:
        return learning_rate * 0.05
    else:
        return learning_rate * (0.9**epoch)
    
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [14]:
def get_model(train_col) :
    
    model = build_model()
    model.fit((np.array(train_data['input_ids']),
               np.array(train_data['attention_masks'])),
        np.array(train_col).ravel(), 
        epochs = 10,
        shuffle=True,
        callbacks = [EarlyStopping(monitor='val_mse', patience=3, restore_best_weights=True), 
                     ModelCheckpoint('roberta_uspppm.h5', monitor='val_mse', 
                                     save_best_only=True, save_weights_only=True), 
                     callback_lr],                     
        batch_size = 16,
        validation_split=0.2 )
    
    return model

# 4. Get Model


In [15]:
%%time

target_cols = ["airline_sentiment", ]
models = {}

for i, col in enumerate(target_cols) :
            
    print (f"-------------- Model for {col} ---------------")
    model = get_model(train[i])
    models[i] = model

-------------- Model for airline_sentiment ---------------


2022-10-22 09:27:03.836072: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
CPU times: user 5min 58s, sys: 34.3 s, total: 6min 33s
Wall time: 7min 57s


# 5. Submission

In [16]:
test_data = create_data(test_df['text'])

In [17]:
preds = []
for i in range(1) :
    pred = models[i].predict((np.array(test_data['input_ids']),
                              np.array(test_data['attention_masks'])))
    preds.append(np.argmax(pred,axis=1))
    
trues = test_df["airline_sentiment"]

In [18]:
from sklearn.metrics import classification_report
print(classification_report(trues,preds[0],target_names=Name,digits=4))

              precision    recall  f1-score   support

    negative     0.5684    0.3802    0.4556       918
     neutral     0.1628    0.2964    0.2102       307
    positive     0.1581    0.1925    0.1736       239

    accuracy                         0.3320      1464
   macro avg     0.2964    0.2897    0.2798      1464
weighted avg     0.4164    0.3320    0.3581      1464

