https://discuss.huggingface.co/t/adding-features-to-a-pretrained-language-model/770/4
https://colab.research.google.com/drive/1eB8EMCwEE1_o5QOdC0gEejxqgkv6Q_cO?usp=sharing#scrollTo=QRwqVzOk0y7x

In [1]:
import torch
# If a GPU is available
if torch.cuda.is_available():    
    #set device to GPU   
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If no GPU is available
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 Ti


In [2]:
#Importing necessary libraries

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import seaborn        as sns

import transformers
from transformers                     import  RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

from tqdm import tqdm


from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split


from torch                            import nn, optim
from torch.utils                      import data
from sklearn.decomposition            import PCA

#Seeding for deterministic results
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED) 
   torch.backends.cudnn.deterministic = True  
   torch.backends.cudnn.benchmark = False


CLASS_NAMES = ['no-fake', 'fake']
MAX_LENGTH = 200
BATCH_SIZE = 8
EPOCHS = 6
HIDDEN_UNITS = 128

tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

In [3]:
train = pd.read_csv('preprocessed.csv')
train['label'] = train.label.astype(int)
train.drop(['subject','speaker','id'], axis=1, inplace=True)
train.head()

Unnamed: 0,label,statement,economy,health-care,taxes,federal-budget,education,jobs,state-budget,candidates-biography,...,state_info_Virginia,"state_info_Washington, D.C.",state_info_Wisconsin,state_info_other,party_affiliation_democrat,party_affiliation_independent,party_affiliation_none,party_affiliation_organization,party_affiliation_other,party_affiliation_republican
0,1,China is in the South China Sea and (building)...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,With the resources it takes to execute just ov...,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,0,The (Wisconsin) governor has proposed tax give...,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,1,Says her representation of an ex-boyfriend who...,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,At protests in Wisconsin against proposed coll...,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [4]:
X_train, X_test = train_test_split(train, test_size=0.2, random_state=RANDOM_SEED)
X_train, X_dev = train_test_split(X_train, test_size=0.2, random_state=RANDOM_SEED)

In [5]:
# x_train = torch.tensor(X_train.values).float()
# x_dev = torch.tensor(X_dev.values).float()
# x_test = torch.tensor(X_test.values).float()
# #Converting prections for train, dev and test data to tensors
# y_train = torch.tensor(y_train)
# y_dev   = torch.tensor(y_dev)
# y_test  = torch.tensor(y_test)

In [6]:
#Creates a dataset which will be used to feed to RoBERTa
class FakeNewDataset(data.Dataset):

  def __init__(self, firstSeq, labelValue, extra_feats, tokenizer, max_len):
    self.firstSeq    = firstSeq      #First input sequence that will be supplied to RoBERTa
    self.labelValue  = labelValue    #label value for each training example in the dataset
    self.tokenizer   = tokenizer     #tokenizer that will be used to tokenize input sequences (Uses BERT-tokenizer here)
    self.max_len     = max_len       #Maximum length of the tokens from the input sequence that BERT needs to attend to
    self.extra_feats = extra_feats
    
  def __len__(self):
    return len(self.labelValue)

  def __getitem__(self, item):
    firstSeq    = str(self.firstSeq[item])
    
    
    #Encoding the first and the second sequence to a form accepted by RoBERTa
    #RoBERTa does not use token_type_ids to distinguish the first sequence from the second sequnece.
    encoding = tokenizer.encode_plus(
        firstSeq,
        max_length = self.max_len,
        add_special_tokens= True,
        truncation = True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'firstSeq' : firstSeq,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labelValue'  : torch.tensor(self.labelValue[item], dtype=torch.long),
        'extra_features' : torch.tensor(self.extra_feats[item]).float()
    }


In [7]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size):
  ds = FakeNewDataset(
      firstSeq    = dataframe.statement.to_numpy(),
      labelValue  = dataframe.label.to_numpy(),
      extra_feats = dataframe.drop(['statement','label'],axis=1).to_numpy(),
      tokenizer   = tokenizer,
      max_len     = max_len
  )

  return data.DataLoader(
      ds,
      batch_size  = batch_size,
      shuffle     = True,
      num_workers = 4
  )


In [8]:
#Creating data loader for training data
trainDataLoader        = createDataLoader(X_train, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
developmentDataLoader  = createDataLoader(X_dev, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(X_test, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [9]:
'''This class defines the model that will be used for 
training and testing on the dataset.

Adapted from huggingFace
This RoBERTa model from huggingface outputs the last hidden states
and the pooled output by default. Pooled output is the classification 
token (1st token of the last hidden state) further processed by a Linear
layer and a Tanh activation function.

The pre-trained RoBERTa model is used as the primary model.
This class experiments with RoBERTa and its ensemble with TF-IDF features. 
roberta-only :            No ensembling. This just fine-tunes the RoBERTa model. 
                          The pooled output is passed through a linear layer and 
                          softmax function is finally used for preictions. 

roberta-tfIdf :           This model conatenates the 1st token of last-hidden layer
                          from RoBERTa with TF-IDF features. Various ways of this 
                          concatenation was experimented (using pooled output instead
                          of 1st token of last hidden layer etc)

roberta-pcaTfidf :        This model concatenates the pooled output from
                          RoBERTa with the PCA transformed vector.

roberta-preTrainedTfIdf : This model concatenates the pooled output from
                          RoBERTa with the hidden layer output from a pre-trained
                          SNN that was trained on TF-IDF features.

Used dropout to prevent over-fitting.'''

class FakeNewsClassifier(nn.Module):

  def __init__(self,  n_classes):
    super(FakeNewsClassifier, self).__init__()
    self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base

    self.drop                      = nn.Dropout(p = 0.3)

    self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

    self.input_size_extrafeats     = self.robertaModel.config.hidden_size + len(X_train.drop(['statement','label'],axis=1).columns)
    
    self.dense                     = nn.Linear( self.input_size_extrafeats,  self.input_size_extrafeats)
    self.out_proj                  = nn.Linear( self.input_size_extrafeats, n_classes)

    
    self.softmax                   = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, extrafeats, modelType):
    
    roberta_output     = self.robertaModel(
        input_ids      = input_ids,               #Input sequence tokens
        attention_mask = attention_mask )         #Mask to avoid performing attention on padding tokens
    #print(roberta_output[1].shape)

    if modelType   == 'roberta-only':
      pooled_output = roberta_output[1]           #Using pooled output
      output        = self.drop(pooled_output)
      output        = self.output(output)

    elif modelType == 'roberta-extra':
      soutput = roberta_output[1]#---------        experimenting with pooled output 
      #soutput = roberta_output[0][:, 0, :]        #taking <s> token (equivalent to [CLS] token in BERT)
      x       = torch.cat((soutput, extrafeats) , dim=1)
      x       = self.drop(x)
      output  = self.out_proj(x)

    
    return self.softmax(output)



In [10]:
#Instantiating a StanceClassifier object as our model and loading the model onto the GPU.
model = FakeNewsClassifier(len(CLASS_NAMES))
model = model.to(device)
#print(model)

In [11]:
'''Using the same optimiser as used in BERT paper
with a different learning rate'''
optimizer = AdamW(model.parameters(), 
                  lr = 2e-6, 
                  correct_bias= False)

totalSteps = len(trainDataLoader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps = totalSteps
)

'''Using class-weights to accomodate heavily imbalanced data. 
These weights were learnt by running several experiments using 
other weights and the weights that produced the best results have
finally been used here'''

weights      = [1.0, 1.0]
classWeights = torch.FloatTensor(weights)
lossFunction = nn.CrossEntropyLoss(weight = classWeights).to(device)


In [12]:
#This function is used for training the model. 
def train_epoch(
  model,
  dataLoader,
  lossFunction,
  optimizer,
  device,
  scheduler,
  n_examples
):

  model = model.train()
  losses = []
  correctPredictions = 0

  for d in tqdm(dataLoader):
    
    input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
    attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
    labelValues            = d["labelValue"].to(device)                          #Loading label value to GPU
    extra_feats            = d["extra_features"]                                    
    extra_feats = extra_feats.to(device)
    

    #Getting the output from our model (Object of StanceClassification class) for train data
    outputs = model(
      input_ids             = input_ids,
      attention_mask        = attention_mask,
      extrafeats    = extra_feats,
      modelType             = 'roberta-extra'
    )

    #Determining the model predictions
    _, predictionIndices = torch.max(outputs, dim=1)
    loss = lossFunction(outputs, labelValues)

    #Calculating the correct predictions for accuracy
    correctPredictions += torch.sum(predictionIndices == labelValues)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses), correctPredictions.double() / n_examples


In [13]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model, 
    dataLoader, 
    lossFunction,
    device,
    n_examples,
    dev = False
    ):
  
  model = model.eval()
  losses = []
  correctPredictions = 0

  with torch.no_grad():
    for d in tqdm(dataLoader):

      input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
      attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
      labelValues            = d["labelValue"].to(device)                         #Loading label values to GPU
      extra_feats            = d["extra_features"]                                    
      extra_feats = extra_feats.to(device)

      #Getting the softmax output from model for dev data
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        extrafeats    = extra_feats,
        modelType             = 'roberta-extra'
      )

      #Determining the model predictions
      _, predictionIndices = torch.max(outputs, dim=1)
      loss = lossFunction(outputs, labelValues)

      #Calculating the correct predictions for accuracy
      correctPredictions += torch.sum(predictionIndices == labelValues)
      losses.append(loss.item())

  return np.mean(losses), correctPredictions.double() / n_examples


In [14]:
#fine tuning ROBERTa and validating it 

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}')
  trainLoss, trainAccuracy = train_epoch(
    model,
    trainDataLoader,
    lossFunction,
    optimizer,
    device,
    scheduler,
    len(X_train)
  )
  
  print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')

  devLoss, devAccuracy = eval_model(
    model,
    developmentDataLoader,
    lossFunction,
    device,
    len(X_dev),
    dev = True
  )

  print(f'Development loss {devLoss} Development accuracy {devAccuracy}')
  print()
  
  print()


  0%|          | 0/716 [00:00<?, ?it/s]

Epoch 1


100%|██████████| 716/716 [05:06<00:00,  2.34it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.6378429572841975 Training accuracy 0.6466480446927374


100%|██████████| 179/179 [00:22<00:00,  7.94it/s]
  0%|          | 0/716 [00:00<?, ?it/s]

Development loss 0.618541777300435 Development accuracy 0.6585195530726258


Epoch 2


100%|██████████| 716/716 [05:10<00:00,  2.30it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.6133500358412386 Training accuracy 0.678945530726257


100%|██████████| 179/179 [00:22<00:00,  7.90it/s]
  0%|          | 0/716 [00:00<?, ?it/s]

Development loss 0.604751403271819 Development accuracy 0.6822625698324023


Epoch 3


100%|██████████| 716/716 [05:10<00:00,  2.31it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.5939404240616873 Training accuracy 0.7068784916201117


100%|██████████| 179/179 [00:22<00:00,  7.98it/s]
  0%|          | 0/716 [00:00<?, ?it/s]

Development loss 0.6138293358533742 Development accuracy 0.6843575418994413


Epoch 4


100%|██████████| 716/716 [05:09<00:00,  2.31it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.5818448495848219 Training accuracy 0.7245111731843575


100%|██████████| 179/179 [00:22<00:00,  7.94it/s]
  0%|          | 0/716 [00:00<?, ?it/s]

Development loss 0.6171875392924474 Development accuracy 0.6829608938547487


Epoch 5


100%|██████████| 716/716 [05:10<00:00,  2.30it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.5660379976034164 Training accuracy 0.7381284916201117


100%|██████████| 179/179 [00:22<00:00,  7.85it/s]
  0%|          | 0/716 [00:00<?, ?it/s]

Development loss 0.6206617401964838 Development accuracy 0.6815642458100559


Epoch 6


100%|██████████| 716/716 [05:09<00:00,  2.31it/s]
  0%|          | 0/179 [00:00<?, ?it/s]

Training loss 0.5520178599897043 Training accuracy 0.7552374301675978


100%|██████████| 179/179 [00:22<00:00,  7.95it/s]

Development loss 0.622540844219357 Development accuracy 0.678072625698324







In [15]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):

  model = model.eval()

  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in tqdm(data_loader):


      input_ids              = d["input_ids"].to(device)
      attention_mask         = d["attention_mask"].to(device)
      labels                 = d["labelValue"].to(device)
      extra_feats            = d["extra_features"].to(device)                                

      #Getting the softmax output from model
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        extrafeats    = extra_feats,
        modelType             = 'roberta-extra'
      )

      _, preds = torch.max(outputs, dim=1)     #Determining the model predictions


      predictions.extend(preds)
      prediction_probs.extend(outputs)
      real_values.extend(labels)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  
  return predictions, prediction_probs, real_values

In [16]:
#Getting model predictions on dev dataset
yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)

100%|██████████| 179/179 [00:21<00:00,  8.28it/s]


In [17]:
  #Printing classification report for dev dataset (Evaluating the model on Dev set)
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     no-fake       0.55      0.44      0.49       501
        fake       0.73      0.81      0.76       931

    accuracy                           0.68      1432
   macro avg       0.64      0.62      0.63      1432
weighted avg       0.67      0.68      0.67      1432



In [19]:
#Getting model predictions on test dataset
yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)

100%|██████████| 224/224 [00:25<00:00,  8.77it/s]


In [20]:
#Printing classification report for test dataset (Evaluating the model on test set)
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     no-fake       0.54      0.44      0.48       629
        fake       0.72      0.80      0.76      1161

    accuracy                           0.67      1790
   macro avg       0.63      0.62      0.62      1790
weighted avg       0.66      0.67      0.66      1790



In [22]:
torch.save(model, './results/custom_transformer')

In [66]:
test_df = pd.read_csv('test_preprocessed.csv')
test_df.head()

Unnamed: 0,id,statement,subject,speaker,economy,health-care,taxes,federal-budget,education,jobs,...,state_info_Virginia,"state_info_Washington, D.C.",state_info_Wisconsin,state_info_other,party_affiliation_democrat,party_affiliation_independent,party_affiliation_none,party_affiliation_organization,party_affiliation_other,party_affiliation_republican
0,dc32e5ffa8b,Five members of [the Common Cause Georgia] boa...,"campaign-finance,ethics,government-regulation",kasim-reed,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,aa49bb41cab,Theres no negative advertising in my campaign ...,elections,bill-mccollum,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,dddc8d12ac1,Leticia Van de Putte voted to give illegal imm...,"health-care,immigration,public-health",dan-patrick,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,bcfe8f51667,Fiorinas plan would mean slashing Social Secur...,"federal-budget,medicare,social-security",barbara-boxer,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,eedbbaff5ab,"By the end of his first term, President Obama ...","federal-budget,new-hampshire-2012",mitt-romney,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [67]:
test_df['label'] = 1

In [68]:
ids = test_df.id.copy()
test_df.drop(columns=['id','subject','speaker'], inplace=True)

In [69]:
#Creating data loader for test data
testDataLoader_real         = createDataLoader(test_df, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [70]:
#Getting model predictions on test dataset
yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader_real
)

100%|██████████| 480/480 [00:54<00:00,  8.87it/s]


In [71]:
label = pd.Series(yHat_test.tolist())

In [72]:
submission = pd.DataFrame(ids)
submission

Unnamed: 0,id
0,dc32e5ffa8b
1,aa49bb41cab
2,dddc8d12ac1
3,bcfe8f51667
4,eedbbaff5ab
...,...
3831,e050483b866
3832,6221e28aa63
3833,954dc0f0b5d
3834,2fa476b0d2f


In [73]:
submission['label']=label

In [74]:
submission.to_csv('submission_custom_transformer.csv',index=False)
submission

Unnamed: 0,id,label
0,dc32e5ffa8b,0
1,aa49bb41cab,1
2,dddc8d12ac1,1
3,bcfe8f51667,1
4,eedbbaff5ab,1
...,...,...
3831,e050483b866,1
3832,6221e28aa63,1
3833,954dc0f0b5d,1
3834,2fa476b0d2f,0
