In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import torch

import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from transformers import (BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          XLNetForSequenceClassification, XLNetTokenizer,
                          AlbertForSequenceClassification, AlbertTokenizer,
                          AdamW, get_linear_schedule_with_warmup
                          )

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

if torch.cuda.is_available():    
    device = torch.device("cuda")
else:
    device = torch.device("cpu")




In [3]:
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\n', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stop_words]

    return ' '.join(words)

def encode_label(label:str):
    if label == 'true': return 0
    if label == 'mostly-true': return 1
    if label == 'barely-true': return 2
    if label == 'half-true': return 3
    if label == 'false': return 4
    if label == 'pants-fire': return 5
    return -1

def tokenize(X, y):
    input_ids = []
    attention_masks = []

    for txt in X.tolist():
        encoded_text = tokenizer.encode_plus(
                            txt,
                            add_special_tokens = True,
                            max_length = 100,
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt'
                    )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(torch.from_numpy(y.to_numpy()))
    
    return input_ids, attention_masks, labels

def accuracy(pred, actual):
    pred_flat = np.argmax(pred, axis=1).flatten()
    labels_flat = actual.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

### Load Data

In [4]:
df = pd.read_csv('../data/enriched_politifact.csv')
df.head(3)

Unnamed: 0,source,context,target,speaker,documented_time,author_score,headline,article,summary,src_label,topic
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",4,Madison Czopek,"October 31, 2023",[ 5 3 16 54 473 152],haaretz investig reveal discrep israel report ...,viral oct social medium post claim israel lie ...,haaretz isra newspap said x claim report blata...,4,81_gaza_palestinian_israelpalestin_israel
1,Scott Walker,"stated on May 30, 2023 in Interview:",2,Laura Schulte,"October 31, 2023",[26 45 39 41 44 11],wisconsin histor think larg continu blue state,wisconsin help swing presidenti vote donald tr...,although wisconsin vote democrat presidenti ca...,1,3_wisconsin_governor_walker_republican
2,Instagram posts,"stated on October 27, 2023 in a post:",4,Ciara O'Rourke,"October 30, 2023",[ 5 3 16 54 473 152],airport salzburg austria counter peopl flew au...,social medium post poi encourag peopl unfortun...,social medium post poi encourag peopl unfortun...,4,-1_barack_obama_clinton_democrat


In [5]:
X, y = df, df['src_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_base_train, X_base_test = X_train['headline'], X_test['headline']
X_topic_train, X_topic_test = X_train['headline'] + ' ' + X_train['topic'], X_test['headline'] + ' ' + X_test['topic']
X_sumamry_train,  X_sumamry_test = X_train['headline'] + ' ' + X_train['summary'], X_test['headline'] + ' ' + X_test['summary']
X_full_train,  X_full_test= X_train['headline'] + ' ' + X_train['topic'] + ' ' + X_train['summary'] + X_train['source'], X_test['headline'] + ' ' + X_test['topic'] + ' ' + X_test['summary'] + X_test['source']

In [6]:
X_base_train.iloc[0]

'due war iraq american dead sever wound suffer serious health problem relat post traumat stress syndrom'

In [7]:
train_input_ids, train_attention_masks, train_labels = tokenize(X_base_train, y_train)
valid_input_ids, valid_attention_masks, valid_labels = tokenize(X_base_test, y_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
valid_dataset = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)

epochs = 3
batch_size = 8
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size)
valid_dataloader = DataLoader(
            valid_dataset,
            sampler = SequentialSampler(valid_dataset),
            batch_size = batch_size)


### Baseline BERT Model

In [9]:
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels = 6, 
    output_attentions = False,
    output_hidden_states = False)

desc = model.cuda()
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = len(train_dataloader) * epochs)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [10]:
import random
import numpy as np

seed_val = 30
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 100 == 0 and not step == 0:
            print('Batch {} of {}.'.format(step, len(train_dataloader)))

        batch = tuple(t.to(device) for t in batch)
        b_ids, b_mask, b_labels = batch

        model.zero_grad()        
        loss, logits = model(b_ids, 
                             token_type_ids=None, 
                             attention_mask=b_mask, 
                             labels=b_labels,
                             return_dict=False)
        
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader) 
               
    print("Avg. Train Loss: {0:.2f}".format(avg_train_loss))
    print("=====Validating====")

    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_ids, b_mask, b_labels = batch
        
        with torch.no_grad():        
            (loss, logits, _) = model(b_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_mask,
                                   labels=b_labels,
                                   return_dict=False)
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(valid_dataloader)
    print("Avg. Val Acc: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(valid_dataloader)
    print("Val Loss: {0:.2f}".format(avg_val_loss))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )

print("")
print("Training complete!")

Batch 100 of 2346.
Batch 200 of 2346.
Batch 300 of 2346.
Batch 400 of 2346.
Batch 500 of 2346.
Batch 600 of 2346.
Batch 700 of 2346.
Batch 800 of 2346.
Batch 900 of 2346.
Batch 1000 of 2346.
Batch 1100 of 2346.
Batch 1200 of 2346.
Batch 1300 of 2346.
Batch 1400 of 2346.
Batch 1500 of 2346.
Batch 1600 of 2346.
Batch 1700 of 2346.
Batch 1800 of 2346.
Batch 1900 of 2346.
Batch 2000 of 2346.
Batch 2100 of 2346.
Batch 2200 of 2346.
Batch 2300 of 2346.
Avg. Train Loss: 1.57
=====Validating====
Avg. Val Acc: 0.41
Val Loss: 1.51
Batch 100 of 2346.
Batch 200 of 2346.
Batch 300 of 2346.
Batch 400 of 2346.
Batch 500 of 2346.
Batch 600 of 2346.
Batch 700 of 2346.
Batch 800 of 2346.
Batch 900 of 2346.
Batch 1000 of 2346.
Batch 1100 of 2346.
Batch 1200 of 2346.
Batch 1300 of 2346.
Batch 1400 of 2346.
Batch 1500 of 2346.
Batch 1600 of 2346.
Batch 1700 of 2346.
Batch 1800 of 2346.
Batch 1900 of 2346.
Batch 2000 of 2346.
Batch 2100 of 2346.
Batch 2200 of 2346.
Batch 2300 of 2346.
Avg. Train Loss: 1.45


### Factor Models

In [10]:
from clickbait import ClickbaitModel
from sentiment_log import SentimentModel
from spam import SpamModel
from source_reliable import SourceReliableModel

X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['target'], test_size=.2, random_state=11)

clickM = ClickbaitModel()
sentiM = SentimentModel()
spamM = SpamModel()
srcM = SourceReliableModel()

sentiM.fit(X_train)
spamM.fit(X_train)
srcM.fit(X_train)


Training Accuracy: 0.952578
Testing Accuracy: 0.946250
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     12823
           1       0.95      0.95      0.95     12777

    accuracy                           0.95     25600
   macro avg       0.95      0.95      0.95     25600
weighted avg       0.95      0.95      0.95     25600

Training Accuracy: 0.977538
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       373
           0       0.98      1.00      0.99     20063
           1       0.00      0.00      0.00        88

    accuracy                           0.98     20524
   macro avg       0.33      0.33      0.33     20524
weighted avg       0.96      0.98      0.97     20524

Training Accuracy: 0.590431
              precision    recall  f1-score   support

       false       0.00      0.00      0.00      8406
        true       0.59      1.00      0.74     12118

    accuracy     

In [13]:
acc_lst = [0.946, 0.976, 0.412, 0.589]
weight = [acc/sum(acc_lst) for acc in acc_lst]
clickbaitV = clickM.predict(X_test)[1] * weight[0]
sentiV = sentiM.predict(X_test)[1] * weight[1]
spamV = spamM.predict(X_test)[1] * weight[2]
sourceV = srcM.predict(X_test)[1] * weight[3]

In [28]:
train_clickbaitV = clickM.predict(X_train)[1] * weight[0]
train_sentiV = sentiM.predict(X_train)[1] * weight[1]
train_spamV = spamM.predict(X_train)[1] * weight[2]
train_sourceV = srcM.predict(X_train)[1] * weight[3]

In [36]:
veracity_train = pd.DataFrame({'clickbait':train_clickbaitV, 'sentiment': train_sentiV, 'spam': train_spamV, 'source': train_sourceV})
veracity_test = pd.DataFrame({'clickbait':clickbaitV, 'sentiment': sentiV, 'spam': spamV, 'source': sourceV})

In [37]:
clf = LogisticRegression(solver='liblinear')
clf.fit(veracity_train, y_train)
clf.score(veracity_test, y_test)

0.3047138047138047