In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [5]:
import random

In [6]:
CUR_DATASET = "CT-FAN"

In [7]:
data_dev = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_dev.csv')
data_train = pd.read_csv(f'../data/{CUR_DATASET}/Task3_english_training.csv')

test_dataset = pd.concat([data_train, data_dev])
# test_dataset = pd.read_csv(f'../data/{CUR_DATASET}/English_data_test_release_with_rating.csv')

In [8]:
test_dataset.head()

Unnamed: 0,public_id,text,title,our rating
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",FALSE
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,partially false
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,partially false
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,FALSE
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,FALSE


In [9]:
test_dataset.rename(columns={'our rating':'label'}, inplace=True)
test_dataset['label'] = test_dataset['label'].apply(lambda x: x.lower())

In [10]:
test_dataset['label'] = test_dataset['label'].replace({
    'false' : 0,
    'partially false' : 1,
    'true' : 2,
    'other' : 3,
})

In [11]:
test_labels = test_dataset["label"].values.astype(int)

## Data Preprocessing and Preparation

In [12]:
if_stopwords = True
if_lemmatize = True


In [13]:
stop_words = stopwords.words('english')

### Regular Expressions

In [14]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]',' ', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
}


In [15]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [16]:
test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)

In [17]:
if if_stopwords:
    for dataset in [test_dataset]:
        for col in ["title", "text"]:

            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### Lemmatization

In [18]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    for dataset in [test_dataset]:
        for col in ["title", "text"]:
            dataset[col] = dataset[col].str.lower().str.replace("’", "'")
            dataset[col] = dataset[col].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
test_text = (test_dataset["title"] + " " + test_dataset["text"]).values

# Model Loading

In [20]:
from tqdm import tqdm

In [21]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [23]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, balanced_accuracy_score

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [25]:
model_name = f"bert_{CUR_DATASET}_regexp_stopwords_{if_stopwords}_lemmatization_{if_lemmatize}_multiclass"

In [26]:
model = BertForSequenceClassification.from_pretrained(
    f"./models/{model_name}/"
)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [27]:
from bert_utils import tokenize_map

In [28]:
input_ids, attention_masks, labels = tokenize_map(test_text, test_labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [00:04<00:00, 265.29it/s]


In [29]:
import transformers

In [30]:
batch_size = 16
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f1a46792ef0>

In [31]:
test_dataset = TensorDataset(input_ids, attention_masks, labels)

In [32]:
test_dataloader = DataLoader(
            test_dataset, 
            sampler = SequentialSampler(test_dataset), 
            batch_size = batch_size 
)

### Metrics

In [33]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return balanced_accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0, average="weighted")

def mae(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.mean(np.abs(pred_flat - labels_flat))

def mse(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.mean((pred_flat - labels_flat) ** 2)

### Evaluation

In [34]:
import time

In [35]:
t0 = time.time()

model.eval()

total_eval_accuracy = 0
total_eval_loss = 0
total_eval_f1 = 0
total_eval_mae = 0
total_eval_mse = 0
nb_eval_steps = 0

for batch in tqdm(test_dataloader):

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]

        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

    # Accumulate the validation loss.

    total_eval_loss += loss.item()

    # Move logits and labels to CPU:

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:

    total_eval_accuracy += accuracy(logits, label_ids)
    total_eval_f1 += flat_f1_score(logits, label_ids)
    total_eval_mae += mae(logits, label_ids)
    total_eval_mse += mse(logits, label_ids)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 79/79 [00:46<00:00,  1.71it/s]


In [37]:
# Report the final accuracy for this validation run.

avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))

# Report the final f1 score for this validation run.

avg_val_f1 = total_eval_f1 / len(test_dataloader)
print('  F1: {0:.5f}'.format(avg_val_f1))

avg_val_mae = total_eval_mae / len(test_dataloader)
print('  MAE: {0:.5f}'.format(avg_val_mae))

avg_val_mse = total_eval_mse / len(test_dataloader)
print('  MSE: {0:.5f}'.format(avg_val_mse))

# Calculate the average loss over all of the batches.

avg_val_loss = total_eval_loss / len(test_dataloader)



# Measure how long the validation run took:

test_time = time.time() - t0

print('  Test Loss: {0:.5f}'.format(avg_val_loss))
print('  Test took: {:}'.format(test_time))

  Accuracy: 0.83937
  F1: 0.88907
  MAE: 0.15348
  MSE: 0.26899
  Test Loss: 0.35574
  Test took: 63.109795331954956
