### Firstly, lets install transformer package

In [1]:
!pip install transformers



### Import required libraries





In [2]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import re
!pip install emoji --quiet
import emoji
!pip install contractions --quiet
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import unicodedata


from transformers import RobertaTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import plotly.express as px
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Check out for Gpu

In [3]:
if torch.cuda.is_available():    
  device = torch.device("cuda")
  print('The GPU we use is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

The GPU we use is: Tesla P100-PCIE-16GB


#Data and preprocessing

In [4]:
data= pd.read_csv("/content/drive/My Drive/COVID19Tweet-master/TRAIN_WNUT.csv")                        #train dataset
valid=pd.read_csv("/content/drive/My Drive/COVID19Tweet-master/VALID_WNUT.csv")                        #valid dataset

In [5]:
#combining both train and valid dataset
total_set= pd.concat([data,valid],ignore_index=True)
mix = total_set.iloc[:,1:]
le= LabelEncoder()
mix['Label']=le.fit_transform(mix['Label'])
mix

Unnamed: 0,Id,Text,Label
0,1241490299215634434,Official death toll from #covid19 in the Unite...,0
1,1245916400981381130,"Dearest Mr. President @USER 1,169 coronavirus ...",0
2,1241132432402849793,Latest Updates March 20 ⚠️5274 new cases and 3...,0
3,1236107253666607104,真把公主不当干部 BREAKING: 21 people on Grand Princess...,0
4,1239673817552879619,OKLAHOMA CITY — The State Department of Educat...,1
...,...,...,...
7995,1245955124222099456,Coronavirus took hold in UK earlier than thoug...,1
7996,1241768801210904576,I talked with a man who is Rowan County’s seco...,0
7997,1241172153040502795,Governor Wolf delaying enforcement of non-life...,1
7998,1239740620194766848,The Sheriff's Department has reduced the jail ...,1


### Data Cleaning

In [6]:
def cleaning(text):
  text= text.lower()
  text= emoji.demojize(text)
  text=contractions.fix(text)
  text=text.strip()
  text=text.replace('[^\w\s]','')
  text=re.sub(r'http\S+', '', text)
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  BAD_SYMBOLS_RE = re.compile('[^0-9a-z +]')
  text = REPLACE_BY_SPACE_RE.sub(' ' , text)
  text = BAD_SYMBOLS_RE.sub(' ',text)
  
  return text

clean=mix['Text'].apply(cleaning)
STOPWORDS = set(stopwords.words('english'))

ff=[]
for i in clean:
  text=unicodedata.normalize('NFKD', i).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  ff.append(text)
dd=pd.DataFrame(ff)
dataset = pd.concat([dd,mix['Label']],axis=1)
dataset

Unnamed: 0,0,Label
0,official death toll from covid19 in the unite...,0
1,dearest mr president user 1 169 coronavirus ...,0
2,latest updates march 20 warning selector 5274...,0
3,breaking 21 people on grand princess...,0
4,oklahoma city the state department of educat...,1
...,...,...
7995,coronavirus took hold in uk earlier than thoug...,1
7996,i talked with a man who is rowan county s seco...,0
7997,governor wolf delaying enforcement of non life...,1
7998,the sheriff s department has reduced the jail ...,1


### Lets tokenize the sentences using RobertaTokenizer and map those tokens to unique ID's
 


In [7]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)          #for every pre-trained model, it has its own tokenizer 
input_tokens = []

for sent in dataset[0]:                                                                   #Special Tokens are set True to identify the start and end of sentences.
    encoded_sent = tokenizer.encode(sent,add_special_tokens = True,add_prefix_space=True) #It is necessary in case of Roberta to add prefix space.
    input_tokens.append(encoded_sent)


### Lets pad input tokens with 0 value and also truncate the length of sentences to 100 tokens


In [8]:
MAX_LEN = 100
input_ids = pad_sequences(input_tokens, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")


### Attention masks are created to distinguish the actual token Id and padded Id.



In [9]:
attention_masks = []

for sent in input_ids:
    mask = [int(token_id > 0) for token_id in sent] #If a token ID is 0, then it's padding, set the mask to 0 else 1
    attention_masks.append(mask)

### Split the train dataset. Choosen ratio is 9:1

In [10]:
labels=dataset['Label']
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids[:7000], labels[:7000], 
                                                            random_state=0, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks[:7000], labels[:7000],
                                             random_state=0, test_size=0.1)

### Convert to tensors

In [11]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels.values)
validation_labels = torch.tensor(validation_labels.values)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

### Loading RobertaForSequenceClassification

In [12]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base",num_labels = 2,output_attentions = False,output_hidden_states = False)
                                                            #Number of Labels is set to 2 ( Informative and Uninformative )

model.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [13]:
#Batch Size is set to 32 
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size,)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data,  batch_size=batch_size)

### Optimization using AdamW

In [14]:
optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs                                                                #Total number of training steps is number of batches * number of epochs.
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)

In [15]:
#for averaging the predicted scores
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

#for noting the time of execution
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

### For reproducibility

In [16]:
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Training

In [17]:
loss_values = []                                                                              

for epoch_i in range(0, epochs):                                                  #for every epoch
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    t0 = time.time()
    total_loss = 0
    model.train()                                                                  
    
    for step, batch in enumerate(train_dataloader):
      if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
     
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      model.zero_grad()        
      outputs = model(b_input_ids, 
                    token_type_ids=None,                                          #Token_type_Ids are not considered in case of Distilbert
                    attention_mask=b_input_mask, 
                    labels=b_labels)
      loss = outputs[0]
      total_loss += loss.item()
      loss.backward()                                                             # Perform a backward pass to calculate the gradients.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)                     # Update parameters
      optimizer.step()
      scheduler.step()
   
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
   
    t0 = time.time()    
    model.eval() 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():        
            outputs = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask)
        
     
        logits = outputs[0]                                                       # The "logits" are the output to the model ; values prior to applying an activation function like the softmax.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
            
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("Training complete!")

  Batch    40  of    197.    Elapsed: 0:00:13.
  Batch    80  of    197.    Elapsed: 0:00:25.
  Batch   120  of    197.    Elapsed: 0:00:38.
  Batch   160  of    197.    Elapsed: 0:00:50.
  Average training loss: 0.38
  Training epcoh took: 0:01:02
  Accuracy: 0.94
  Validation took: 0:00:02
  Batch    40  of    197.    Elapsed: 0:00:13.
  Batch    80  of    197.    Elapsed: 0:00:25.
  Batch   120  of    197.    Elapsed: 0:00:38.
  Batch   160  of    197.    Elapsed: 0:00:50.
  Average training loss: 0.15
  Training epcoh took: 0:01:02
  Accuracy: 0.95
  Validation took: 0:00:02
  Batch    40  of    197.    Elapsed: 0:00:13.
  Batch    80  of    197.    Elapsed: 0:00:25.
  Batch   120  of    197.    Elapsed: 0:00:38.
  Batch   160  of    197.    Elapsed: 0:00:50.
  Average training loss: 0.10
  Training epcoh took: 0:01:02
  Accuracy: 0.96
  Validation took: 0:00:02
  Batch    40  of    197.    Elapsed: 0:00:12.
  Batch    80  of    197.    Elapsed: 0:00:25.
  Batch   120  of    197.  

### Visualization of training loss of model

In [18]:
f = pd.DataFrame(loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Training loss of the Model',xaxis_title='Epoch',yaxis_title='Loss')
fig.show()

### Prediction of labels for Valid data

In [19]:
prediction_inputs = torch.tensor(input_ids[7000:])
prediction_masks = torch.tensor(attention_masks[7000:])
prediction_labels = torch.tensor(labels[7000:].values)
 
batch_size = 32  
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size)
model.eval()
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  predictions.append(logits)
  true_labels.append(label_ids)


### F1 scores

In [20]:
Score = []
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  ss = f1_score(true_labels[i], pred_labels_i)                
  Score.append(ss)

flat_predictions = [item for sublist in predictions for item in sublist]          # Combine the predictions for each batch
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]
F1_SCORE = f1_score(flat_true_labels, flat_predictions)
print( F1_SCORE)

0.8897715988083417
