### Setup

In [1]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import csv
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import (BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          XLMForSequenceClassification, XLMTokenizer,
                          XLNetForSequenceClassification, XLNetTokenizer,
                          AlbertForSequenceClassification, AlbertTokenizer,
                          AdamW, get_linear_schedule_with_warmup
                          )

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared NLTK helpers, created once at module level and used by clean_text:
# a Snowball stemmer configured for English and a WordNet lemmatizer.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def encode_label(label:str):
    """Translate a truthfulness label string into its integer class code.

    The code is the position of the label in the sequence below
    ('true' -> 0, 'mostly-true' -> 1, 'barely-true' -> 2, 'half-true' -> 3,
    'false' -> 4, 'pants-fire' -> 5). Any value that matches none of the
    six labels yields -1.
    """
    ordered_labels = ('true', 'mostly-true', 'barely-true',
                      'half-true', 'false', 'pants-fire')
    for code, known_label in enumerate(ordered_labels):
        if label == known_label:
            return code
    return -1

def load_df(file_path:str, is_plus:bool):
    """Load a headerless, tab-separated statement file into a DataFrame.

    Positional columns 2, 3, 5, 14 and 15 are read and named ``target``,
    ``headline``, ``speaker``, ``context`` and ``justification``. Rows with a
    missing value in any of those five columns are dropped; note that this
    happens before ``justification`` is discarded, so it applies whatever the
    value of ``is_plus``. ``target`` is finally converted to an integer code
    with ``encode_label``.

    Parameters
    ----------
    file_path : str
        Path of the TSV file to read.
    is_plus : bool
        When true the ``justification`` column is kept; otherwise only
        ``target``, ``headline``, ``speaker`` and ``context`` are returned.

    Returns
    -------
    pandas.DataFrame
        The selected columns, keeping the original row index of the rows
        that survived ``dropna``.
    """
    column_names = {2:'target', 3:'headline', 5:'speaker', 14:'context', 15:'justification'}
    df = (
        pd.read_csv(file_path, sep='\t', header=None, quoting=csv.QUOTE_NONE,
                    usecols=list(column_names))
        .dropna()
        .rename(columns=column_names)
    )

    if not is_plus:
        # Take an explicit copy: assigning into a bare column subset below
        # would otherwise trigger pandas' SettingWithCopyWarning.
        df = df[['target', 'headline', 'speaker', 'context']].copy()

    df['target'] = df['target'].apply(encode_label)
    return df

# Cache for the English stop-word set; filled on the first clean_text call so
# the stop-word list is not rebuilt for every document.
_STOP_WORDS_CACHE = None


def clean_text(text):
    """Normalise a piece of text into a space-joined string of stemmed tokens.

    Steps: lower-case, delete digit runs, turn newlines into spaces, delete
    every character that is neither a word character nor whitespace, tokenize
    with ``word_tokenize``, drop English stop words, then apply the
    module-level ``stemmer`` followed by the module-level ``lemmatizer`` to
    each remaining token.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Newlines become spaces rather than '': deleting them outright glued the
    # last word of one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and symbols

    words = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in _STOP_WORDS_CACHE
    ]
    return ' '.join(words)

### Loading Data...

In [3]:
# Load train2.tsv from the liar_plus data folder, keeping the justification
# column (is_plus=True), and preview the first three rows.
lp_train = load_df('../data/liar_plus/train2.tsv', is_plus=True)
lp_train.head(3)

Unnamed: 0,target,headline,speaker,context,justification
0,4,Says the Annies List political group supports ...,dwayne-bohac,a mailer,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,scott-surovell,a floor speech.,"""Surovell said the decline of coal """"started w..."
2,1,"""Hillary Clinton agrees with John McCain """"by ...",barack-obama,Denver,"""Obama said he would have voted against the am..."


In [4]:
# Read clean_politifact.csv and preview the first three rows.
# NOTE(review): judging by the file name and the stemmed text in the preview,
# this file is presumably already preprocessed -- confirm how it was produced.
pf = pd.read_csv('../data/clean_politifact.csv')
pf.head(3)

Unnamed: 0,source,context,target,speaker,documented_time,author_score,headline,article,summary,src_label
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",4,Madison Czopek,"October 31, 2023",[ 5 3 16 54 473 152],haaretz investig reveal discrep israel report ...,viral oct social medium post claim israel lie ...,haaretz isra newspap said x claim report blata...,4
1,Scott Walker,"stated on May 30, 2023 in Interview:",2,Laura Schulte,"October 31, 2023",[26 45 39 41 44 11],wisconsin histor think larg continu blue state,wisconsin help swing presidenti vote donald tr...,although wisconsin vote democrat presidenti ca...,1
2,Instagram posts,"stated on October 27, 2023 in a post:",4,Ciara O'Rourke,"October 30, 2023",[ 5 3 16 54 473 152],airport salzburg austria counter peopl flew au...,social medium post poi encourag peopl unfortun...,social medium post poi encourag peopl unfortun...,4


### Topic Modeling

In [5]:
# Plain Python list of the `summary` column; this is the document collection
# handed to the topic model and to the NER pipeline in the cells below.
summaries = pf['summary'].tolist()

In [6]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

# Three named topic representations computed side by side:
#   "Main"     - KeyBERTInspired on its own
#   "POS"      - PartOfSpeech (spaCy model "en_core_web_sm") chained with
#                MaximalMarginalRelevance (diversity=.8)
#   "Key_High" - KeyBERTInspired with top_n_words=15 chained with
#                MaximalMarginalRelevance (diversity=.8)
representation_model = {
   "Main": KeyBERTInspired(),
   "POS":  [PartOfSpeech("en_core_web_sm"), MaximalMarginalRelevance(diversity=.8)],
   "Key_High": [KeyBERTInspired(top_n_words=15), MaximalMarginalRelevance(diversity=.8)]
}

# NOTE(review): no random seed is passed anywhere in this cell -- confirm
# whether topic ids and counts are reproducible across kernel restarts.
topic_model = BERTopic(language="english", min_topic_size=20, 
                       representation_model=representation_model, verbose=True)
# Fit on the summaries; `topics` and `probs` are the two values returned by
# fit_transform for the input documents.
topics, probs = topic_model.fit_transform(summaries)

2024-02-29 01:06:26,439 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 733/733 [00:18<00:00, 39.57it/s] 
2024-02-29 01:06:45,817 - BERTopic - Embedding - Completed ✓
2024-02-29 01:06:45,818 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-29 01:07:17,743 - BERTopic - Dimensionality - Completed ✓
2024-02-29 01:07:17,745 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-29 01:07:19,594 - BERTopic - Cluster - Completed ✓
2024-02-29 01:07:19,599 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-29 01:07:35,311 - BERTopic - Representation - Completed ✓


In [22]:
# Per-document table from the fitted topic model (assigned topic, topic name,
# the representations, probability, ...); preview the first three rows.
tf = topic_model.get_document_info(summaries)
tf.head(3)

Unnamed: 0,Document,Topic,Name,Representation,POS,Key_High,Representative_Docs,Top_n_words,Probability,Representative_document
0,haaretz isra newspap said x claim report blata...,81,81_gaza_palestinian_israelpalestin_israel,"[gaza, palestinian, israelpalestin, israel, is...","[palestinian, isra, attack, news, munayy, jebr...","[gaza, israelpalestin, israelgaza, isra, beiru...",[isra prime minist benjamin netanyahu warn dan...,gaza - palestinian - israelpalestin - israel -...,1.0,False
1,although wisconsin vote democrat presidenti ca...,3,3_wisconsin_governor_walker_republican,"[wisconsin, governor, walker, republican, stat...","[wisconsin, walker, state, elect, claim, year,...","[wisconsin, governor, walker, polit, democrat,...",[employ figur june announc wisconsin republica...,wisconsin - governor - walker - republican - s...,1.0,False
2,social medium post poi encourag peopl unfortun...,-1,-1_barack_obama_clinton_democrat,"[barack, obama, clinton, democrat, campaign, t...","[trump, presid, year, state, octob, claim, rep...","[barack, democrat, campaign, biden, congress, ...",[barack obama made direct case democrat nomin ...,barack - obama - clinton - democrat - campaign...,0.0,False


In [8]:
# Display BERTopic's built-in topic visualisation for the fitted model.
topic_model.visualize_topics()

In [9]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Single-linkage hierarchical clustering of ``x`` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Hierarchical topics: build the topic hierarchy with the custom linkage
# above, then display it (the last expression is the cell output).
hierarchical_topics = topic_model.hierarchical_topics(summaries, linkage_function=linkage_function)

topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 102/102 [00:10<00:00,  9.54it/s]


In [10]:
# Break the fitted topics down by the `target` label of each PolitiFact row,
# then plot the result restricted to 10 topics (top_n_topics=10).
topics_per_class = topic_model.topics_per_class(summaries, classes=pf['target'])
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)

0it [00:00, ?it/s]

6it [00:15,  2.59s/it]


### Entity Extraction

In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Tokenizer and token-classification model are both loaded from the
# "dslim/bert-base-NER" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# Hugging Face "ner" pipeline wrapping the model/tokenizer pair above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [13]:
# Run the NER pipeline on the second summary after title-casing it.
# NOTE(review): the title-casing is presumably there because the summaries are
# lower-cased while the model is case-sensitive -- confirm. The output below
# contains sub-word pieces such as '##i'.
ner_results = nlp(summaries[1].title())
ner_results

[{'entity': 'B-LOC',
  'score': 0.9977677,
  'index': 2,
  'word': 'Wisconsin',
  'start': 9,
  'end': 18},
 {'entity': 'B-MISC',
  'score': 0.8960398,
  'index': 5,
  'word': 'Democrat',
  'start': 24,
  'end': 32},
 {'entity': 'B-PER',
  'score': 0.98860335,
  'index': 7,
  'word': '##i',
  'start': 42,
  'end': 43},
 {'entity': 'I-PER',
  'score': 0.9991339,
  'index': 8,
  'word': 'Can',
  'start': 44,
  'end': 47},
 {'entity': 'B-MISC',
  'score': 0.99933416,
  'index': 11,
  'word': 'Republican',
  'start': 51,
  'end': 61},
 {'entity': 'B-PER',
  'score': 0.8062724,
  'index': 13,
  'word': '##i',
  'start': 71,
  'end': 72},
 {'entity': 'I-PER',
  'score': 0.9940282,
  'index': 14,
  'word': 'Can',
  'start': 73,
  'end': 76},
 {'entity': 'I-MISC',
  'score': 0.8278222,
  'index': 21,
  'word': 'His',
  'start': 95,
  'end': 98},
 {'entity': 'B-LOC',
  'score': 0.9518032,
  'index': 27,
  'word': 'Wisconsin',
  'start': 110,
  'end': 119},
 {'entity': 'B-MISC',
  'score': 0.997