In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [5]:
import random

In [6]:
DATASETS = {
    "FakeNews": "",
    "ISOT": ""
}

CUR_DATASET = "ISOT"

In [7]:
dataset_fake = pd.read_csv(f"./data/{CUR_DATASET}/Fake.csv.zip")
dataset_fake['label'] = 0
dataset_true = pd.read_csv(f"./data/{CUR_DATASET}/True.csv.zip")
dataset_true['label'] = 1
ISOT_dataset = pd.concat([dataset_fake, dataset_true])

In [8]:
ISOT_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [9]:
ISOT_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [10]:
ISOT_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


## Data Preprocessing and Preparation

In [11]:
if_stopwords = True
if_lemmatize = True


### Regular Expressions

In [12]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}


In [13]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [14]:
ISOT_dataset["title"] = ISOT_dataset["title"].apply(preprocess_text)
ISOT_dataset["text"] = ISOT_dataset["text"].apply(preprocess_text)
ISOT_dataset.head(10)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Years...,Donald Trump just couldnwish all AmericansHapp...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,On Friday it was revealed that former Milwauke...,News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obamas Name ...,On Christmas day Donald Trump announced that h...,News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,Fresh Off The Golf Course Trump Lashes Out At...,Donald Trump spentgood portion of his day at h...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH BrandNew ProTrump Ad Features So MuchKi...,Just when you might have thought wegetbreak fr...,News,"December 21, 2017",0


### Lemmatization

In [15]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    ISOT_dataset["title"] = ISOT_dataset["title"].str.lower().str.replace("’", "'")
    ISOT_dataset["title"] = ISOT_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    ISOT_dataset["text"] = ISOT_dataset["text"].str.lower().str.replace("’", "'")
    ISOT_dataset["text"] = ISOT_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Stopwords

In [16]:
stop_words = stopwords.words('english')

In [17]:
if if_stopwords:
    ISOT_dataset["title"] = ISOT_dataset["title"].str.lower().str.replace("’", "'")
    ISOT_dataset["title"] = ISOT_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    ISOT_dataset["text"] = ISOT_dataset["text"].str.lower().str.replace("’", "'")
    ISOT_dataset["text"] = ISOT_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [18]:
ISOT_dataset = ISOT_dataset.sample(frac=1, random_state=10)

In [19]:
test_text = (ISOT_dataset['title'] + " " + ISOT_dataset['text']).values

In [20]:
test_labels = ISOT_dataset['label'].values

# Model Loading

In [21]:
from tqdm import tqdm

In [22]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [23]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [24]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [26]:
model_name = "bert_regexp_stopwords_lemmatization"

In [27]:
model = BertForSequenceClassification.from_pretrained(
    f"./models/{model_name}/"
)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [28]:
from bert_utils import tokenize_map

In [29]:
input_ids, attention_masks, labels = tokenize_map(test_text, test_labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44898/44898 [01:43<00:00, 433.65it/s]


In [30]:
import transformers

In [31]:
batch_size = 16
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f6fcd936f70>

In [32]:
test_dataset = TensorDataset(input_ids, attention_masks, labels)

In [33]:
test_dataloader = DataLoader(
            test_dataset, 
            sampler = SequentialSampler(test_dataset), 
            batch_size = batch_size 
)

### Metrics

In [34]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0)

### Evaluation

In [35]:
import time

In [36]:
t0 = time.time()

model.eval()

total_eval_accuracy = 0
total_eval_loss = 0
total_eval_f1 = 0
nb_eval_steps = 0

for batch in tqdm(test_dataloader):

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]

        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

    # Accumulate the validation loss.

    total_eval_loss += loss.item()

    # Move logits and labels to CPU:

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:

    total_eval_accuracy += accuracy(logits, label_ids)
    total_eval_f1 += flat_f1_score(logits, label_ids)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2807/2807 [26:44<00:00,  1.75it/s]


In [37]:
# Report the final accuracy for this validation run.

avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))

# Report the final f1 score for this validation run.

avg_val_f1 = total_eval_f1 / len(test_dataloader)
print('  F1: {0:.5f}'.format(avg_val_f1))

# Calculate the average loss over all of the batches.

avg_val_loss = total_eval_loss / len(test_dataloader)



# Measure how long the validation run took:

test_time = time.time() - t0

print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
print('  Validation took: {:}'.format(test_time))

  Accuracy: 0.41517
  F1: 0.57178
  Validation Loss: 5.92430
  Validation took: 1604.4437456130981
