### Setup

In [1]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import csv
import torch
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import (BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          XLMForSequenceClassification, XLMTokenizer,
                          XLNetForSequenceClassification, XLNetTokenizer,
                          DistilBertForSequenceClassification, DistilBertTokenizer,
                          AlbertForSequenceClassification, AlbertTokenizer,
                          AdamW, get_linear_schedule_with_warmup
                          )

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def encode_label(label:str):
    """Translate a truthfulness rating string into its integer class id.

    Mapping: 'true' -> 0, 'mostly-true' -> 1, 'barely-true' -> 2,
    'half-true' -> 3, 'false' -> 4, 'pants-fire' -> 5.
    Any value that matches none of these yields -1.

    NOTE(review): 'barely-true' is coded below 'half-true'; confirm this is
    intended if the ids are ever treated as an ordinal scale.
    """
    # Position in this tuple is the class id, so the order must not change.
    known_labels = ('true', 'mostly-true', 'barely-true', 'half-true', 'false', 'pants-fire')
    for code, name in enumerate(known_labels):
        if label == name:
            return code
    return -1

def load_df(file_path:str, is_plus:bool):
    """Read a header-less, tab-separated statements file into a labelled DataFrame.

    Columns 2, 3, 5, 14 and 15 of the file are loaded and named 'target',
    'headline', 'speaker', 'context' and 'justification'. Rows with a missing
    value in any of those five columns are dropped; this happens before the
    optional removal of 'justification', so it applies in both modes.

    Parameters
    ----------
    file_path : str
        Location of the TSV file.
    is_plus : bool
        When equal to False, the 'justification' column is left out of the
        result.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with 'target' replaced by the integer codes
        produced by ``encode_label``.
    """
    # Source column position -> column name used in the rest of the notebook.
    column_names = {2:'target', 3:'headline', 5:'speaker' , 14:'context', 15:'justification'}

    raw = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,  # quote characters are kept as literal text
        usecols=list(column_names),
    )
    frame = raw.dropna().rename(columns=column_names)

    if is_plus == False:
        frame = frame[['target', 'headline', 'speaker', 'context']]

    return frame.assign(target=frame['target'].apply(encode_label))

### Loading Data...

In [3]:
# Load train2.tsv with is_plus=True, i.e. keeping the 'justification' column.
lp_train = load_df('../data/liar_plus/train2.tsv', is_plus=True)
# Preview the first three rows.
lp_train.head(3)

Unnamed: 0,target,headline,speaker,context,justification
0,4,Says the Annies List political group supports ...,dwayne-bohac,a mailer,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,scott-surovell,a floor speech.,"""Surovell said the decline of coal """"started w..."
2,1,"""Hillary Clinton agrees with John McCain """"by ...",barack-obama,Denver,"""Obama said he would have voted against the am..."


In [4]:
# Build the PolitiFact frame in one chain: drop the columns that are not used
# here, rename 'when/where' to 'context', encode the rating as an integer and
# keep only rows whose rating was recognised by encode_label (code != -1).
pf = (
    pd.read_csv('../data/politifact_plus.csv')
    .drop(columns=['documented_time', 'author_score', 'summaries', 'article'])
    .rename(columns={'when/where':'context'})
    .assign(target=lambda frame: frame['target'].apply(encode_label))
    .loc[lambda frame: frame['target'] != -1]
)
pf.head(3)

Unnamed: 0,source,context,headline,target,speaker,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,4,Madison Czopek,5.0,3.0,16.0,54.0,480.0,157.0
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,2,Laura Schulte,26.0,45.0,39.0,41.0,44.0,11.0
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",4,Ciara O'Rourke,5.0,3.0,16.0,54.0,480.0,157.0


In [5]:
# Full article texts, one list entry per row of the CSV.
# Only the 'article' column is parsed; the other columns are not needed here.
# NOTE: this list covers every row of the file, whereas `pf` above drops rows
# with an unrecognised rating, so positions in `articles` do not necessarily
# correspond to rows of `pf`.
articles = pd.read_csv('../data/politifact_plus.csv', usecols=['article'])['article'].tolist()

### Topic Modeling

In [6]:
# NOTE(review): this import lives mid-notebook rather than in the Setup cell.
from bertopic import BERTopic

# Baseline topic model with default components; calculate_probabilities=True
# is requested so that `probs` is returned alongside the topic assignments.
# NOTE(review): no random seed is set in this cell; if any stage of the
# pipeline is stochastic, topic ids may differ between runs (TODO confirm).
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# Fit on every article and keep the per-document topics and probabilities.
topics, probs = topic_model.fit_transform(articles)

2024-01-24 03:15:30,204 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 802/802 [01:08<00:00, 11.62it/s]
2024-01-24 03:16:43,859 - BERTopic - Embedding - Completed ✓
2024-01-24 03:16:43,861 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-24 03:17:16,485 - BERTopic - Dimensionality - Completed ✓
2024-01-24 03:17:16,486 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-24 03:20:38,579 - BERTopic - Cluster - Completed ✓
2024-01-24 03:20:38,586 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-24 03:21:03,676 - BERTopic - Representation - Completed ✓


In [7]:
# Per-document summary for the baseline model (assigned topic, topic name,
# representation words, probability); one row per entry of `articles`.
topic_model.get_document_info(articles)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,A viral Oct. 28 social media post claimed that...,59,59_israeli_hamas_palestinian_video,"[israeli, hamas, palestinian, video, israel, g...",[Thousands of people have been killed or injur...,israeli - hamas - palestinian - video - israel...,1.000000,False
1,"In 2016, Wisconsin helped to swing the preside...",-1,-1_the_and_to_of,"[the, and, to, of, that, in, on, for, was, is]","[Ever since the Affordable Care Act passed, Re...",the - and - to - of - that - in - on - for - w...,0.936171,False
2,A social media post poised to encourage people...,-1,-1_the_and_to_of,"[the, and, to, of, that, in, on, for, was, is]","[Ever since the Affordable Care Act passed, Re...",the - and - to - of - that - in - on - for - w...,0.403023,False
3,The Gaza Health Ministry has said the Palestin...,59,59_israeli_hamas_palestinian_video,"[israeli, hamas, palestinian, video, israel, g...",[Thousands of people have been killed or injur...,israeli - hamas - palestinian - video - israel...,0.136235,False
4,Let’s clear the air. Do wind turbine component...,113,113_wind_energy_turbines_power,"[wind, energy, turbines, power, turbine, renew...",[Wind farms are a pillar of America’s climate ...,wind - energy - turbines - power - turbine - r...,1.000000,False
...,...,...,...,...,...,...,...,...
25650,"Huckabee says there are ""probably plenty"" of p...",54,54_religious_god_muslim_church,"[religious, god, muslim, church, religion, sha...","[President Barack Obama is a Christian, but th...",religious - god - muslim - church - religion -...,0.109933,False
25651,It's not entirely clear from the context wheth...,8,8_tax_income_taxes_cuts,"[tax, income, taxes, cuts, percent, top, taxpa...",[With a backdrop of leaked lewd comments and B...,tax - income - taxes - cuts - percent - top - ...,0.034884,False
25652,Statistics from the Bureau of Labor Statistics...,2,2_christie_jersey_new_christies,"[christie, jersey, new, christies, jerseys, go...",[New Jersey’s jobs picture is improving so muc...,christie - jersey - new - christies - jerseys ...,0.086440,False
25653,Corporate profits have been rising. The Commer...,266,266_income_median_inflationadjusted_household,"[income, median, inflationadjusted, household,...",[As Democratic presidential candidate Hillary ...,income - median - inflationadjusted - househol...,1.000000,False


### Representation Tuning

In [8]:
# NOTE(review): this import lives mid-notebook rather than in the Setup cell.
from bertopic.representation import PartOfSpeech

# Part-of-speech based topic representation; "en_core_web_sm" is presumably
# the name of the language model it loads (TODO confirm it is installed).
representation_model = PartOfSpeech("en_core_web_sm")
# A brand-new BERTopic model is fitted from scratch here, so its topic ids are
# independent of those produced by `topic_model`.
pos_model = BERTopic(representation_model=representation_model)
topics_pos, probs_pos = pos_model.fit_transform(articles)

In [9]:
# Per-document summary for the part-of-speech representation model.
pos_model.get_document_info(articles)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,A viral Oct. 28 social media post claimed that...,86,86_hamas_video_footage_false,"[hamas, video, footage, false, news, massacre,...",[Gaza’s casualties have piled up much faster t...,hamas - video - footage - false - news - massa...,0.838081,False
1,"In 2016, Wisconsin helped to swing the preside...",-1,-1_more_state_news_one,"[more, state, news, one, claim, post, presiden...","[If you’re a voter in the United States, you’v...",more - state - news - one - claim - post - pre...,0.000000,False
2,A social media post poised to encourage people...,-1,-1_more_state_news_one,"[more, state, news, one, claim, post, presiden...","[If you’re a voter in the United States, you’v...",more - state - news - one - claim - post - pre...,0.000000,False
3,The Gaza Health Ministry has said the Palestin...,86,86_hamas_video_footage_false,"[hamas, video, footage, false, news, massacre,...",[Gaza’s casualties have piled up much faster t...,hamas - video - footage - false - news - massa...,0.967798,False
4,Let’s clear the air. Do wind turbine component...,218,218_wind_turbines_energy_turbine,"[wind, turbines, energy, turbine, power, renew...",[Eli Bremer is one of seven Colorado Republica...,wind - turbines - energy - turbine - power - r...,1.000000,False
...,...,...,...,...,...,...,...,...
25650,"Huckabee says there are ""probably plenty"" of p...",16,16_religious_church_religion_mosque,"[religious, church, religion, mosque, prayer, ...","[President Barack Obama is a Christian, but a ...",religious - church - religion - mosque - praye...,1.000000,False
25651,It's not entirely clear from the context wheth...,7,7_tax_income_taxes_cuts,"[tax, income, taxes, cuts, percent, top, taxpa...","[Treasury Secretary Steve Mnuchin, one of the ...",tax - income - taxes - cuts - percent - top - ...,1.000000,False
25652,Statistics from the Bureau of Labor Statistics...,3,3_new_governor_tax_income,"[new, governor, tax, income, property, fiscal,...","[When Gov. Chris Christie took office, New Jer...",new - governor - tax - income - property - fis...,1.000000,False
25653,Corporate profits have been rising. The Commer...,24,24_income_wealth_inequality_top,"[income, wealth, inequality, top, bottom, medi...",[Democratic presidential candidate Bernie Sand...,income - wealth - inequality - top - bottom - ...,1.000000,False


In [10]:
# NOTE(review): this import lives mid-notebook rather than in the Setup cell.
from bertopic.representation import MaximalMarginalRelevance

# Maximal-marginal-relevance topic representation with diversity=0.3.
# Note that this rebinds `representation_model`, which previously held the
# PartOfSpeech instance.
representation_model = MaximalMarginalRelevance(diversity=0.3)
# Another independent BERTopic fit on the same articles.
mmr_model = BERTopic(representation_model=representation_model)
topics_mmr, probs_mmr = mmr_model.fit_transform(articles)

In [11]:
# Per-document summary for the MMR representation model.
mmr_model.get_document_info(articles)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,A viral Oct. 28 social media post claimed that...,89,89_hamas_palestinian_gaza_footage,"[hamas, palestinian, gaza, footage, israels, n...",[Gaza’s casualties have piled up much faster t...,hamas - palestinian - gaza - footage - israels...,0.989592,False
1,"In 2016, Wisconsin helped to swing the preside...",-1,-1_to_of_on_was,"[to, of, on, was, said, his, an, at, we, be]",[As he neared the end of his first 100 days in...,to - of - on - was - said - his - an - at - we...,0.000000,False
2,A social media post poised to encourage people...,-1,-1_to_of_on_was,"[to, of, on, was, said, his, an, at, we, be]",[As he neared the end of his first 100 days in...,to - of - on - was - said - his - an - at - we...,0.000000,False
3,The Gaza Health Ministry has said the Palestin...,89,89_hamas_palestinian_gaza_footage,"[hamas, palestinian, gaza, footage, israels, n...",[Gaza’s casualties have piled up much faster t...,hamas - palestinian - gaza - footage - israels...,1.000000,False
4,Let’s clear the air. Do wind turbine component...,127,127_turbines_turbine_grid_renewable,"[turbines, turbine, grid, renewable, electrici...",[The debate over Deepwater Wind’s plans to bui...,turbines - turbine - grid - renewable - electr...,1.000000,False
...,...,...,...,...,...,...,...,...
25650,"Huckabee says there are ""probably plenty"" of p...",84,84_church_faith_trinity_secular,"[church, faith, trinity, secular, muslims, oba...",[President Barack Obama has declared himself a...,church - faith - trinity - secular - muslims -...,0.746621,False
25651,It's not entirely clear from the context wheth...,4,4_taxes_cuts_percent_estate,"[taxes, cuts, percent, estate, gains, middle, ...",[With a backdrop of leaked lewd comments and B...,taxes - cuts - percent - estate - gains - midd...,1.000000,False
25652,Statistics from the Bureau of Labor Statistics...,-1,-1_to_of_on_was,"[to, of, on, was, said, his, an, at, we, be]",[As he neared the end of his first 100 days in...,to - of - on - was - said - his - an - at - we...,0.000000,False
25653,Corporate profits have been rising. The Commer...,19,19_wealth_inequality_sanders_median,"[wealth, inequality, sanders, median, walton, ...","[Now that the Democratic primary is over, Sen....",wealth - inequality - sanders - median - walto...,1.000000,False


### Entity Extraction

In [14]:
# NOTE(review): these imports live mid-notebook rather than in the Setup cell.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the tokenizer and token-classification weights published under
# "dslim/bert-base-NER".
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# Named-entity-recognition pipeline built from the two objects above.
# NOTE(review): the recorded output shows word-piece fragments such as 'Ha',
# '##aret', '##z' as separate entries; if whole-entity spans are wanted, the
# pipeline's aggregation option may be worth evaluating (it would change the
# keys of each result dict, so check downstream cells first).
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

[{'entity': 'B-LOC', 'score': 0.9996146, 'index': 12, 'word': 'Israel', 'start': 48, 'end': 54}, {'entity': 'B-MISC', 'score': 0.99972373, 'index': 21, 'word': 'Israeli', 'start': 92, 'end': 99}, {'entity': 'B-ORG', 'score': 0.99891055, 'index': 23, 'word': 'Ha', 'start': 110, 'end': 112}, {'entity': 'I-ORG', 'score': 0.9976046, 'index': 24, 'word': '##aret', 'start': 112, 'end': 116}, {'entity': 'I-ORG', 'score': 0.99876606, 'index': 25, 'word': '##z', 'start': 116, 'end': 117}, {'entity': 'B-ORG', 'score': 0.99672794, 'index': 30, 'word': 'Ha', 'start': 132, 'end': 134}, {'entity': 'I-ORG', 'score': 0.99403775, 'index': 31, 'word': '##aret', 'start': 134, 'end': 138}, {'entity': 'I-ORG', 'score': 0.996802, 'index': 32, 'word': '##z', 'start': 138, 'end': 139}, {'entity': 'B-LOC', 'score': 0.99975675, 'index': 40, 'word': 'Israel', 'start': 179, 'end': 185}, {'entity': 'B-ORG', 'score': 0.99277335, 'index': 60, 'word': 'In', 'start': 268, 'end': 270}, {'entity': 'B-ORG', 'score': 0.83

In [18]:
# Run the NER pipeline on the first article only.
ner_results = nlp(articles[0])
# Plain-text dump of the list of entity dicts.
print(ner_results)

[{'entity': 'B-LOC', 'score': 0.9996146, 'index': 12, 'word': 'Israel', 'start': 48, 'end': 54}, {'entity': 'B-MISC', 'score': 0.99972373, 'index': 21, 'word': 'Israeli', 'start': 92, 'end': 99}, {'entity': 'B-ORG', 'score': 0.99891055, 'index': 23, 'word': 'Ha', 'start': 110, 'end': 112}, {'entity': 'I-ORG', 'score': 0.9976046, 'index': 24, 'word': '##aret', 'start': 112, 'end': 116}, {'entity': 'I-ORG', 'score': 0.99876606, 'index': 25, 'word': '##z', 'start': 116, 'end': 117}, {'entity': 'B-ORG', 'score': 0.99672794, 'index': 30, 'word': 'Ha', 'start': 132, 'end': 134}, {'entity': 'I-ORG', 'score': 0.99403775, 'index': 31, 'word': '##aret', 'start': 134, 'end': 138}, {'entity': 'I-ORG', 'score': 0.996802, 'index': 32, 'word': '##z', 'start': 138, 'end': 139}, {'entity': 'B-LOC', 'score': 0.99975675, 'index': 40, 'word': 'Israel', 'start': 179, 'end': 185}, {'entity': 'B-ORG', 'score': 0.99277335, 'index': 60, 'word': 'In', 'start': 268, 'end': 270}, {'entity': 'B-ORG', 'score': 0.83

In [20]:
# Display the same entity list via the notebook's pretty-printed repr.
ner_results

[{'entity': 'B-LOC',
  'score': 0.9996146,
  'index': 12,
  'word': 'Israel',
  'start': 48,
  'end': 54},
 {'entity': 'B-MISC',
  'score': 0.99972373,
  'index': 21,
  'word': 'Israeli',
  'start': 92,
  'end': 99},
 {'entity': 'B-ORG',
  'score': 0.99891055,
  'index': 23,
  'word': 'Ha',
  'start': 110,
  'end': 112},
 {'entity': 'I-ORG',
  'score': 0.9976046,
  'index': 24,
  'word': '##aret',
  'start': 112,
  'end': 116},
 {'entity': 'I-ORG',
  'score': 0.99876606,
  'index': 25,
  'word': '##z',
  'start': 116,
  'end': 117},
 {'entity': 'B-ORG',
  'score': 0.99672794,
  'index': 30,
  'word': 'Ha',
  'start': 132,
  'end': 134},
 {'entity': 'I-ORG',
  'score': 0.99403775,
  'index': 31,
  'word': '##aret',
  'start': 134,
  'end': 138},
 {'entity': 'I-ORG',
  'score': 0.996802,
  'index': 32,
  'word': '##z',
  'start': 138,
  'end': 139},
 {'entity': 'B-LOC',
  'score': 0.99975675,
  'index': 40,
  'word': 'Israel',
  'start': 179,
  'end': 185},
 {'entity': 'B-ORG',
  'score'