# Important Imports

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
import re

In [33]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading Data

In [35]:
CUR_DATASET = "CT-FAN"

In [36]:
data_dev = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_dev.csv')
data_train = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_training.csv')
data_test = pd.read_csv(f'../data/{CUR_DATASET}/English_data_test_release_with_rating.csv')

data_concat = pd.concat([data_train, data_dev])
data_concat.rename(columns={'our rating':'label'}, inplace=True)
data_test.rename(columns={'our rating':'label'}, inplace=True)
data_concat['label'] = data_concat['label'].apply(lambda x: x.lower())
data_test['label'] = data_test['label'].apply(lambda x: x.lower())

train_dataset = data_concat
test_dataset = data_test

In [37]:
train_dataset.head()

Unnamed: 0,public_id,text,title,label
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",false
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,partially false
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,partially false
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,false
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,false


In [38]:
train_dataset = train_dataset.fillna("null data")
test_dataset = test_dataset.fillna("null data")

In [39]:
np.unique(train_dataset["label"])

array(['false', 'other', 'partially false', 'true'], dtype=object)

In [40]:
train_dataset['label'] = train_dataset['label'].replace({
    'false' : 0,
    'partially false' : 1,
    'true' : 2,
    'other' : 3,
})

test_dataset['label'] = test_dataset['label'].replace({
    'false' : 0,
    'partially false' : 1,
    'true' : 2,
    'other' : 3,
})

In [41]:
train_labels = train_dataset["label"].values.astype(int)
test_labels = test_dataset["label"].values.astype(int)

# Data Preprocessing and Data Preparation

### Removing stopwords

In [42]:
stop_words = stopwords.words('english')

In [43]:
if_stopwords = True

### Preprocessing

In [44]:
# from num2words import num2words

In [45]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
}

In [46]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [47]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)

test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)


In [48]:
if if_stopwords:
    for dataset in [train_dataset, test_dataset]:
        for col in ["title", "text"]:

            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


### Lemmatization and Stemming

In [49]:
if_lemmatize = True

In [50]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    for dataset in [train_dataset, test_dataset]:
        for col in ["title", "text"]:
            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [51]:
train_text = (train_dataset["title"] + " " + train_dataset["text"]).values
test_text = (test_dataset["title"] + " " + test_dataset["text"]).values

# Model Initialization

In [52]:
from tqdm import tqdm

In [53]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

cuda


In [54]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [55]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, balanced_accuracy_score

In [56]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [57]:
print(' Original: ', train_text[0])
print('Tokenized: ', tokenizer.tokenize(train_text[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text[0])))

 Original:  fined 1500 passenger using mobile phone starting next week distracted driving cause death canada impaired driving every province territory law driving operating cell phone tell passenger stay phone driving measure necessary distracted driving claimed life impaired driving province like british columbia ontario quebec alberta nova scotia manitoba newfoundland labrador mobile phone even held passenger dangerous distraction driver starting next week distracted screen held passenger attracts penalty 1500 three demerit point driver screen mix doesnt matter holding device using facetime taking selfies driver showing driver funny cat video nono province mobile phone categorised visual display unit meaning considered akin television screen important practice safe driving sake fellow driver canada cracking distracted driving problem rollout stricter law impose harsher penalty heftier fine guilty offender taking effect next week add serious penalty convicted distracted driving
Tokeni

In [58]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # Use the 124-layer, 1024-hidden, 16-heads, 340M parameters BERT model with an uncased vocab.
    num_labels = len(np.unique(labels)), 
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [59]:
max_len = 0
len_limit = 512
LIMIT = 100_000

indices = []
train_text_filtered = []

for i, text in enumerate(tqdm(train_text)):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
    if len(input_ids) <= LIMIT:
        train_text_filtered.append(text)
        indices.append(i)
print('Max sentence length: ', max_len)

  0%|                                                                                                               | 0/1264 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [00:04<00:00, 258.45it/s]

Max sentence length:  4905





In [62]:
# https://www.kaggle.com/code/jeongwonkim10516/nlp-fake-news-with-bert-99-55-top1/notebook

def tokenize_map(sentence, labs='None'):
    
    """A function for tokenize all of the sentences and map the tokens to their word IDs."""
    
    global labels
    
    input_ids = []
    attention_masks = []

    # For every sentence...
    
    for text in tqdm(sentence):
        #   "encode_plus" will:
        
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            truncation='longest_first', # Activate and control truncation
                            max_length = len_limit,           # Max length according to our text data.
                            padding = 'max_length', # Pad & truncate all sentences.
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list. 
        
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.
        labels = torch.tensor(labels)
        return input_ids, attention_masks, labels
    
    return input_ids, attention_masks

In [None]:
train_text_filtered = np.array(train_text)

In [None]:
train_text.shape, train_text_filtered.shape,

In [None]:
input_ids, attention_masks, labels_filtered = tokenize_map(train_text_filtered, labels_filtered)
# test_input_ids, test_attention_masks= tokenize_map(test_text)

## Train and Validation Dataset

In [None]:
import transformers

In [None]:
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

In [None]:
type(attention_masks)

In [None]:
labels_filtered.shape, input_ids.shape

In [None]:
labels_filtered = labels[indices]
labels_filtered.shape

In [None]:
dataset = TensorDataset(input_ids, attention_masks, labels_filtered)

In [None]:
torch.manual_seed(seed)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_size, val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
# Train DataLoader
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Validation DataLoader
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
)

In [61]:
# # Test DataLoader

# test_data = TensorDataset(test_input_ids, test_attention_masks)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Hyperparameters

In [45]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon
            )

In [46]:
num_epochs = 10
total_num_steps = len(train_dataloader) * num_epochs

## Metrics

In [47]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return balanced_accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0, average="weighted")

## Training

In [48]:
import time

In [49]:
training_stats = []
validations_labels_ep = []
actual_labels_ep = []

total_t0 = time.time()
for i in range(0, num_epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(i + 1, num_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 50 == 0 and not step == 0:
            elapsed = time.time() - t0
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = time.time() - t0

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoc h took: {:}'.format(training_time))
    
    print('')
    print('Running Validation...')

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            loss = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]

            logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[1]
            
        # Accumulate the validation loss.
        
        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:
        
        total_eval_accuracy += accuracy(logits, label_ids)
        total_eval_f1 += flat_f1_score(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))
    
    # Report the final f1 score for this validation run.
    
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.5f}'.format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    
    
    # Measure how long the validation run took:
    
    validation_time = time.time() - t0
    
    print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


Training...
----- Epoch 1 / 10 -----
  Batch    50  of  2,539.    Elapsed: 18.526731967926025.
  Batch   100  of  2,539.    Elapsed: 35.77412509918213.
  Batch   150  of  2,539.    Elapsed: 53.16911959648132.
  Batch   200  of  2,539.    Elapsed: 70.50109362602234.
  Batch   250  of  2,539.    Elapsed: 87.71071934700012.
  Batch   300  of  2,539.    Elapsed: 104.87576293945312.
  Batch   350  of  2,539.    Elapsed: 122.0901551246643.
  Batch   400  of  2,539.    Elapsed: 139.23676371574402.
  Batch   450  of  2,539.    Elapsed: 156.49644017219543.
  Batch   500  of  2,539.    Elapsed: 173.66641092300415.
  Batch   550  of  2,539.    Elapsed: 190.8449137210846.
  Batch   600  of  2,539.    Elapsed: 208.10811042785645.
  Batch   650  of  2,539.    Elapsed: 225.41806554794312.
  Batch   700  of  2,539.    Elapsed: 242.5963010787964.
  Batch   750  of  2,539.    Elapsed: 259.8150019645691.
  Batch   800  of  2,539.    Elapsed: 277.03073954582214.
  Batch   850  of  2,539.    Elapsed: 294.

  Batch 1,800  of  2,539.    Elapsed: 620.6085796356201.
  Batch 1,850  of  2,539.    Elapsed: 637.896146774292.
  Batch 1,900  of  2,539.    Elapsed: 655.0860097408295.
  Batch 1,950  of  2,539.    Elapsed: 672.4487993717194.
  Batch 2,000  of  2,539.    Elapsed: 689.6684730052948.
  Batch 2,050  of  2,539.    Elapsed: 706.8717904090881.
  Batch 2,100  of  2,539.    Elapsed: 724.1131122112274.
  Batch 2,150  of  2,539.    Elapsed: 741.3255364894867.
  Batch 2,200  of  2,539.    Elapsed: 758.6409070491791.
  Batch 2,250  of  2,539.    Elapsed: 775.868989944458.
  Batch 2,300  of  2,539.    Elapsed: 793.0881237983704.
  Batch 2,350  of  2,539.    Elapsed: 810.3698906898499.
  Batch 2,400  of  2,539.    Elapsed: 827.5748028755188.
  Batch 2,450  of  2,539.    Elapsed: 844.870617389679.
  Batch 2,500  of  2,539.    Elapsed: 862.0970323085785.

  Average training loss: 1.60
  Training epoc h took: 875.2838418483734

Running Validation...
  Accuracy: 0.24055
  F1: 0.23421
  Validation Loss:

  Batch   850  of  2,539.    Elapsed: 292.8903269767761.
  Batch   900  of  2,539.    Elapsed: 310.0629482269287.
  Batch   950  of  2,539.    Elapsed: 327.2508850097656.
  Batch 1,000  of  2,539.    Elapsed: 344.4425871372223.
  Batch 1,050  of  2,539.    Elapsed: 361.65435814857483.
  Batch 1,100  of  2,539.    Elapsed: 378.8284478187561.
  Batch 1,150  of  2,539.    Elapsed: 396.03552770614624.
  Batch 1,200  of  2,539.    Elapsed: 413.23247718811035.
  Batch 1,250  of  2,539.    Elapsed: 430.46280241012573.
  Batch 1,300  of  2,539.    Elapsed: 447.60668778419495.
  Batch 1,350  of  2,539.    Elapsed: 464.85051107406616.
  Batch 1,400  of  2,539.    Elapsed: 482.0901689529419.
  Batch 1,450  of  2,539.    Elapsed: 499.2764096260071.
  Batch 1,500  of  2,539.    Elapsed: 516.4534933567047.
  Batch 1,550  of  2,539.    Elapsed: 533.6892232894897.
  Batch 1,600  of  2,539.    Elapsed: 550.8968191146851.
  Batch 1,650  of  2,539.    Elapsed: 568.084897518158.
  Batch 1,700  of  2,539. 

  Accuracy: 0.23307
  F1: 0.22777
  Validation Loss: 4.61566
  Validation took: 93.42043662071228

Training...
----- Epoch 9 / 10 -----
  Batch    50  of  2,539.    Elapsed: 17.22549057006836.
  Batch   100  of  2,539.    Elapsed: 34.431715965270996.
  Batch   150  of  2,539.    Elapsed: 51.620269775390625.
  Batch   200  of  2,539.    Elapsed: 68.78500461578369.
  Batch   250  of  2,539.    Elapsed: 86.03352928161621.
  Batch   300  of  2,539.    Elapsed: 103.21951913833618.
  Batch   350  of  2,539.    Elapsed: 120.41086220741272.
  Batch   400  of  2,539.    Elapsed: 137.58420300483704.
  Batch   450  of  2,539.    Elapsed: 154.7733075618744.
  Batch   500  of  2,539.    Elapsed: 171.9531626701355.
  Batch   550  of  2,539.    Elapsed: 189.2171356678009.
  Batch   600  of  2,539.    Elapsed: 206.51807045936584.
  Batch   650  of  2,539.    Elapsed: 223.72701120376587.
  Batch   700  of  2,539.    Elapsed: 241.0017695426941.
  Batch   750  of  2,539.    Elapsed: 258.16606426239014.
 

In [51]:
model.save_pretrained(f"./models/bert_{CUR_DATASET}_regexp_stopwords_{if_stopwords}_lemmatization_{if_lemmatize}_multiclass_SJ")