# Important Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading Data

In [5]:
CUR_DATASET = "LIAR-PLUS"

In [6]:
train_dataset = pd.read_csv(f'../data/{CUR_DATASET}/train2.tsv', sep='\t', header = None)
valid_dataset = pd.read_csv(f'../data/{CUR_DATASET}/val2.tsv', sep='\t', header = None)
test_dataset = pd.read_csv(f'../data/{CUR_DATASET}/test2.tsv', sep='\t', header = None)

In [7]:
train = train_dataset.iloc[:, [2, 3, 15]]
train = train.rename(columns = {2: 'label', 3: 'statements', 15: 'justification'})

val = valid_dataset.iloc[:, [2, 3, 15]]
val = val.rename(columns = {2: 'label', 3: 'statements', 15: 'justification'})

In [8]:
for dataset in [train, val]:
    dataset['label'] = dataset['label'].replace({
        'false' : 0,
        'barely-true' : 0,
        'pants-fire' : 0,
        'half-true' : 1,
        'mostly-true' : 1,
        'true' : 1
    })

# Some More EDA

In [10]:
train.isnull().sum()

label             2
statements        2
justification    88
dtype: int64

In [11]:
for dataset in [train, val]:
    dataset = dataset.dropna(inplace=True)

In [12]:
train_labels = train["label"].values.astype(int)
val_labels = val["label"].values.astype(int)

In [13]:
np.unique(val_labels)

array([0, 1])

# Data Preprocessing and Data Preparation

### Removing stopwords

In [14]:
stop_words = stopwords.words('english')

In [15]:
if_stopwords = True

### Preprocessing

In [16]:
# from num2words import num2words

In [17]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
}

In [18]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [19]:
for dataset in [train, val]:
    dataset["statements"] = dataset["statements"].apply(preprocess_text)
    dataset["justification"] = dataset["justification"].apply(preprocess_text)

In [20]:
if if_stopwords:
    for dataset in [train, val]:
        for col in ["statements", "justification"]:
            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### Lemmatization and Stemming

In [21]:
if_lemmatize = True

In [22]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    for dataset in [train, val]:
        for col in ["statements", "justification"]:
            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
train_text = (train["statements"] + " " + train["justification"]).values
val_text = (val["statements"] + " " + val["justification"]).values

# Model Initialization

In [24]:
from tqdm import tqdm

In [25]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [26]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [27]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [29]:
np.unique(train_labels)

array([0, 1])

In [30]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # Use the 124-layer, 1024-hidden, 16-heads, 340M parameters BERT model with an uncased vocab.
    num_labels = len(np.unique(train_labels)), 
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [32]:
max_len = 0
len_limit = 512
LIMIT = 100_000

indices = []
train_text_filtered = []

for i, text in enumerate(tqdm(train_text)):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
    if len(input_ids) <= LIMIT:
        train_text_filtered.append(text)
        indices.append(i)
print('Max sentence length: ', max_len)

  0%|                                                                                                                | 0/10154 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2089 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10154/10154 [00:04<00:00, 2066.62it/s]

Max sentence length:  2089





In [107]:
# labels_filtered = train_labels[indices]
# labels_filtered.shape


torch.Size([10154])

In [33]:
# https://www.kaggle.com/code/jeongwonkim10516/nlp-fake-news-with-bert-99-55-top1/notebook

def tokenize_map(sentence, labs='None'):
    
    """A function for tokenize all of the sentences and map the tokens to their word IDs."""
    
    input_ids = []
    attention_masks = []

    # For every sentence...
    
    for text in tqdm(sentence):
        #   "encode_plus" will:
        
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            truncation='longest_first', # Activate and control truncation
                            max_length = len_limit,           # Max length according to our text data.
                            padding = 'max_length', # Pad & truncate all sentences.
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list. 
        
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.
        labels = torch.tensor(labels)
        return input_ids, attention_masks
    
    return input_ids, attention_masks

In [34]:
train_text_filtered = np.array(train_text)
val_text_filtered = np.array(val_text)

In [35]:
train_text.shape, train_text_filtered.shape, val_text.shape, val_text_filtered.shape

((10154,), (10154,), (1280,), (1280,))

In [36]:
input_ids, attention_masks = tokenize_map(train_text_filtered)
train_labels = torch.tensor(train_labels)

val_input_ids, val_attention_masks = tokenize_map(val_text_filtered)
val_labels = torch.tensor(val_labels)

# test_input_ids, test_attention_masks= tokenize_map(test_text)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10154/10154 [00:05<00:00, 1785.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1280/1280 [00:00<00:00, 1861.35it/s]


## Train and Validation Dataset

In [37]:
import transformers

In [38]:
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f55197e6e90>

In [39]:
type(attention_masks)

torch.Tensor

In [41]:
train_labels.shape, input_ids.shape

(torch.Size([10154]), torch.Size([10154, 512]))

In [42]:
train_dataset = TensorDataset(input_ids, attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

In [43]:
# Train DataLoader
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Validation DataLoader
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
)

In [44]:
# # Test DataLoader

# test_data = TensorDataset(test_input_ids, test_attention_masks)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Hyperparameters

In [44]:
from torch.optim import lr_scheduler

In [45]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate
                  # eps = 1e-8 # args.adam_epsilon
            )

lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

In [46]:
num_epochs = 20
total_num_steps = len(train_dataloader) * num_epochs

## Metrics

In [47]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0)

## Training

In [48]:
import time

In [49]:
training_stats = []
validations_labels_ep = []
actual_labels_ep = []

total_t0 = time.time()
for i in range(0, num_epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(i + 1, num_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 500 == 0 and not step == 0:
            elapsed = time.time() - t0
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = time.time() - t0

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoc h took: {:}'.format(training_time))
    
    print('')
    print('Running Validation...')

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            loss = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]

            logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[1]
            
        # Accumulate the validation loss.
        
        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:
        
        total_eval_accuracy += accuracy(logits, label_ids)
        total_eval_f1 += flat_f1_score(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))
    
    # Report the final f1 score for this validation run.
    
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.5f}'.format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    lr_scheduler.step()
    
    # Measure how long the validation run took:
    
    validation_time = time.time() - t0
    
    print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


Training...
----- Epoch 1 / 20 -----
  Batch   500  of  2,539.    Elapsed: 175.45674347877502.
  Batch 1,000  of  2,539.    Elapsed: 349.02380657196045.
  Batch 1,500  of  2,539.    Elapsed: 523.5164775848389.
  Batch 2,000  of  2,539.    Elapsed: 697.1212501525879.
  Batch 2,500  of  2,539.    Elapsed: 871.1205580234528.

  Average training loss: 0.68
  Training epoc h took: 884.4071106910706

Running Validation...
  Accuracy: 0.61406
  F1: 0.61682
  Validation Loss: 0.67573
  Validation took: 47.516018867492676

Training...
----- Epoch 2 / 20 -----
  Batch   500  of  2,539.    Elapsed: 175.65511393547058.
  Batch 1,000  of  2,539.    Elapsed: 353.0663757324219.
  Batch 1,500  of  2,539.    Elapsed: 529.5510084629059.
  Batch 2,000  of  2,539.    Elapsed: 701.7482740879059.
  Batch 2,500  of  2,539.    Elapsed: 874.0486047267914.

  Average training loss: 0.64
  Training epoc h took: 887.2981009483337

Running Validation...
  Accuracy: 0.62187
  F1: 0.60906
  Validation Loss: 0.67344

  Accuracy: 0.60703
  F1: 0.58728
  Validation Loss: 2.64544
  Validation took: 47.14756917953491

Training...
----- Epoch 17 / 20 -----
  Batch   500  of  2,539.    Elapsed: 172.1030035018921.
  Batch 1,000  of  2,539.    Elapsed: 344.20143008232117.
  Batch 1,500  of  2,539.    Elapsed: 516.3800897598267.
  Batch 2,000  of  2,539.    Elapsed: 688.7685272693634.
  Batch 2,500  of  2,539.    Elapsed: 861.0287554264069.

  Average training loss: 0.18
  Training epoc h took: 874.3484251499176

Running Validation...
  Accuracy: 0.60781
  F1: 0.58790
  Validation Loss: 2.65395
  Validation took: 47.11502647399902

Training...
----- Epoch 18 / 20 -----
  Batch   500  of  2,539.    Elapsed: 172.2247109413147.
  Batch 1,000  of  2,539.    Elapsed: 344.41288709640503.
  Batch 1,500  of  2,539.    Elapsed: 516.6125838756561.
  Batch 2,000  of  2,539.    Elapsed: 688.7442991733551.
  Batch 2,500  of  2,539.    Elapsed: 860.8153083324432.

  Average training loss: 0.17
  Training epoc h took: 874

In [50]:
model.save_pretrained(f"./models/bert_{CUR_DATASET}_regexp_stopwords_{if_stopwords}_lemmatization_{if_lemmatize}_binary_SJ_2")