# Important Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading Data

In [5]:
CUR_DATASET = "CT-FAN"

In [6]:
data_dev = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_dev.csv')
data_train = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_training.csv')
data_test = pd.read_csv(f'../data/{CUR_DATASET}/English_data_test_release_with_rating.csv')

data_concat = pd.concat([data_train, data_dev])
data_concat.rename(columns={'our rating':'label'}, inplace=True)
data_test.rename(columns={'our rating':'label'}, inplace=True)
data_concat['label'] = data_concat['label'].apply(lambda x: x.lower())
data_test['label'] = data_test['label'].apply(lambda x: x.lower())

train_dataset = data_concat
test_dataset = data_test

In [7]:
train_dataset.head()

Unnamed: 0,public_id,text,title,label
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",false
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,partially false
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,partially false
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,false
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,false


In [8]:
train_dataset = train_dataset.fillna("null data")
test_dataset = test_dataset.fillna("null data")

In [9]:
train_dataset['label'] = train_dataset['label'].replace({
    'false' : 0,
    'partially false' : 1,
    'true' : 2,
    'other' : 3,
})

test_dataset['label'] = test_dataset['label'].replace({
    'false' : 0,
    'partially false' : 1,
    'true' : 2,
    'other' : 3,
})

In [10]:
train_labels = train_dataset["label"].values.astype(int)
test_labels = test_dataset["label"].values.astype(int)

# Some More EDA

In [11]:
np.unique(train_labels)

array([0, 1, 2, 3])

# Data Preprocessing and Data Preparation

### Removing stopwords

In [12]:
stop_words = stopwords.words('english')

In [13]:
if_stopwords = True

### Preprocessing

In [14]:
# from num2words import num2words

In [15]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
}

In [16]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [17]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)

test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)

In [18]:
if if_stopwords:
    for dataset in [train_dataset, test_dataset]:
        for col in ["title", "text"]:

            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### Lemmatization and Stemming

In [19]:
if_lemmatize = True

In [20]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    for dataset in [train_dataset, test_dataset]:
        for col in ["title", "text"]:
            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
train_text = (train_dataset["title"] + " " + train_dataset["text"]).values
test_text = (test_dataset["title"] + " " + test_dataset["text"]).values

# Model Initialization

In [22]:
from tqdm import tqdm

In [23]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [24]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [25]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, balanced_accuracy_score

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [27]:
np.unique(train_labels)

array([0, 1, 2, 3])

In [28]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # Use the 124-layer, 1024-hidden, 16-heads, 340M parameters BERT model with an uncased vocab.
    num_labels = len(np.unique(train_labels)), 
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [29]:
max_len = 0
len_limit = 512
LIMIT = 100_000

indices = []
train_text_filtered = []

for i, text in enumerate(tqdm(train_text)):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
    if len(input_ids) <= LIMIT:
        train_text_filtered.append(text)
        indices.append(i)
print('Max sentence length: ', max_len)

  0%|                                                                                                               | 0/1264 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [00:04<00:00, 262.33it/s]

Max sentence length:  4905





In [30]:
# labels_filtered = train_labels[indices]
# labels_filtered.shape


In [31]:
# https://www.kaggle.com/code/jeongwonkim10516/nlp-fake-news-with-bert-99-55-top1/notebook

def tokenize_map(sentence, labs='None'):
    
    """A function for tokenize all of the sentences and map the tokens to their word IDs."""
    
    input_ids = []
    attention_masks = []

    # For every sentence...
    
    for text in tqdm(sentence):
        #   "encode_plus" will:
        
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            truncation='longest_first', # Activate and control truncation
                            max_length = len_limit,           # Max length according to our text data.
                            padding = 'max_length', # Pad & truncate all sentences.
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list. 
        
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.
        labels = torch.tensor(labels)
        return input_ids, attention_masks
    
    return input_ids, attention_masks

In [32]:
train_text_filtered = np.array(train_text)
test_text_filtered = np.array(test_text)

In [33]:
train_text.shape, train_text_filtered.shape, test_text.shape, test_text_filtered.shape

((1264,), (1264,), (612,), (612,))

In [34]:
input_ids, attention_masks = tokenize_map(train_text_filtered)
train_labels = torch.tensor(train_labels)

val_input_ids, val_attention_masks = tokenize_map(test_text_filtered)
val_labels = torch.tensor(test_labels)

# test_input_ids, test_attention_masks= tokenize_map(test_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [00:04<00:00, 270.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 612/612 [00:03<00:00, 199.32it/s]


In [35]:
train_labels

tensor([0, 1, 1,  ..., 0, 2, 2])

## Train and Validation Dataset

In [36]:
import transformers

In [37]:
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f0f0d387070>

In [38]:
type(attention_masks)

torch.Tensor

In [39]:
train_labels.shape, input_ids.shape

(torch.Size([1264]), torch.Size([1264, 512]))

In [40]:
train_dataset = TensorDataset(input_ids, attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, torch.Tensor(test_labels))

In [41]:
# Train DataLoader
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Validation DataLoader
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
)

In [42]:
# # Test DataLoader

# test_data = TensorDataset(test_input_ids, test_attention_masks)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Hyperparameters

In [43]:
from torch.optim import lr_scheduler

In [44]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate
                  # eps = 1e-8 # args.adam_epsilon
            )

lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [45]:
num_epochs = 11
total_num_steps = len(train_dataloader) * num_epochs

## Metrics

In [46]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    return balanced_accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    
    return f1_score(labels_flat, pred_flat, zero_division=0, average="weighted")

## Training

In [47]:
import time

In [48]:
training_stats = []
validations_labels_ep = []
actual_labels_ep = []

total_t0 = time.time()
for i in range(0, num_epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(i + 1, num_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 500 == 0 and not step == 0:
            elapsed = time.time() - t0
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = time.time() - t0

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoc h took: {:}'.format(training_time))
    
    print('')
    print('Running Validation...')

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(torch.int64).to(device)
        
        with torch.no_grad():        

            loss = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]

            logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[1]
            
        # Accumulate the validation loss.
        
        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:
        
        total_eval_accuracy += accuracy(logits, label_ids)
        total_eval_f1 += flat_f1_score(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))
    
    # Report the final f1 score for this validation run.
    
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.5f}'.format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    lr_scheduler.step()
    
    # Measure how long the validation run took:
    
    validation_time = time.time() - t0
    
    print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


Training...
----- Epoch 1 / 11 -----

  Average training loss: 1.23
  Training epoc h took: 110.94423055648804

Running Validation...






  Accuracy: 0.47113
  F1: 0.48204
  Validation Loss: 1.23594
  Validation took: 22.94356894493103

Training...
----- Epoch 2 / 11 -----

  Average training loss: 1.10
  Training epoc h took: 109.61510038375854

Running Validation...






  Accuracy: 0.47549
  F1: 0.49483
  Validation Loss: 1.18269
  Validation took: 22.678997039794922

Training...
----- Epoch 3 / 11 -----

  Average training loss: 0.99
  Training epoc h took: 109.81276822090149

Running Validation...






  Accuracy: 0.49074
  F1: 0.52326
  Validation Loss: 1.26696
  Validation took: 22.699379205703735

Training...
----- Epoch 4 / 11 -----

  Average training loss: 0.86
  Training epoc h took: 109.71295189857483

Running Validation...






  Accuracy: 0.49292
  F1: 0.51008
  Validation Loss: 1.64908
  Validation took: 22.688154935836792

Training...
----- Epoch 5 / 11 -----

  Average training loss: 0.72
  Training epoc h took: 109.52967119216919

Running Validation...






  Accuracy: 0.50436
  F1: 0.51135
  Validation Loss: 1.59589
  Validation took: 22.683653831481934

Training...
----- Epoch 6 / 11 -----

  Average training loss: 0.51
  Training epoc h took: 109.60635423660278

Running Validation...








  Accuracy: 0.51144
  F1: 0.55720
  Validation Loss: 1.64462
  Validation took: 22.71139883995056

Training...
----- Epoch 7 / 11 -----

  Average training loss: 0.45
  Training epoc h took: 109.80779004096985

Running Validation...








  Accuracy: 0.49292
  F1: 0.52946
  Validation Loss: 1.88139
  Validation took: 22.741271018981934

Training...
----- Epoch 8 / 11 -----

  Average training loss: 0.42
  Training epoc h took: 109.66261911392212

Running Validation...








  Accuracy: 0.48965
  F1: 0.52573
  Validation Loss: 1.95971
  Validation took: 22.745999813079834

Training...
----- Epoch 9 / 11 -----

  Average training loss: 0.38
  Training epoc h took: 110.31266617774963

Running Validation...








  Accuracy: 0.49510
  F1: 0.53230
  Validation Loss: 2.10573
  Validation took: 22.710118055343628

Training...
----- Epoch 10 / 11 -----

  Average training loss: 0.36
  Training epoc h took: 109.68272113800049

Running Validation...








  Accuracy: 0.47930
  F1: 0.51596
  Validation Loss: 2.23594
  Validation took: 22.713592290878296

Training...
----- Epoch 11 / 11 -----

  Average training loss: 0.32
  Training epoc h took: 109.55146503448486

Running Validation...








  Accuracy: 0.48911
  F1: 0.53315
  Validation Loss: 2.17186
  Validation took: 22.70931887626648




In [49]:
model.save_pretrained(f"./models/bert_{CUR_DATASET}_regexp_stopwords_{if_stopwords}_lemmatization_{if_lemmatize}_multiclass")