### Setup

In [17]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import csv
import torch
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import (BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          XLMForSequenceClassification, XLMTokenizer,
                          XLNetForSequenceClassification, XLNetTokenizer,
                          DistilBertForSequenceClassification, DistilBertTokenizer,
                          AlbertForSequenceClassification, AlbertTokenizer,
                          AdamW, get_linear_schedule_with_warmup
                          )

import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
def encode_label(label:str):
    """Translate a truthfulness verdict string into its integer class id.

    The id is the position of the label in ``known_labels`` below (note that
    'barely-true' maps to 2 and 'half-true' to 3). Any value that is not
    exactly equal to one of the six labels is encoded as -1.
    """
    known_labels = ('true', 'mostly-true', 'barely-true',
                    'half-true', 'false', 'pants-fire')
    for class_id, name in enumerate(known_labels):
        if label == name:
            return class_id
    # Unknown / unsupported verdict.
    return -1

def load_df(file_path:str, is_plus:bool):
    """Load a header-less, tab-separated statement file with encoded targets.

    Parameters
    ----------
    file_path : str
        Path to the TSV file. Columns 2, 3, 5, 14 and 15 are read and named
        'target', 'headline', 'speaker', 'context' and 'justification'.
    is_plus : bool
        If True, keep the 'justification' column. If False, return only
        'target', 'headline', 'speaker' and 'context'.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with 'target' replaced by the integer code
        returned by ``encode_label``. The row index of the file is kept
        (it is not reset after rows are dropped).

    Notes
    -----
    Rows with a missing value in any of the five columns read are dropped
    before the column selection, so a row without a justification is removed
    even when ``is_plus`` is False.
    """
    df = (
        pd.read_csv(file_path, sep='\t', header=None, quoting=csv.QUOTE_NONE,
                    usecols=[2, 3, 5, 14, 15])
        .dropna()
        .rename(columns={2: 'target', 3: 'headline', 5: 'speaker',
                         14: 'context', 15: 'justification'})
    )

    # Encode the labels on the full frame, before any column selection:
    # assigning a column into the result of df[[...]] is what triggers
    # pandas' SettingWithCopyWarning.
    df = df.assign(target=df['target'].apply(encode_label))

    if not is_plus:
        # .copy() hands the caller an independent frame, so later column
        # assignments on the returned object do not warn either.
        df = df[['target', 'headline', 'speaker', 'context']].copy()

    return df

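### Loading Data

Load the LIAR-PLUS training file (with and without the justification column) and the PolitiFact export; verdict labels are encoded as integers by `encode_label`.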

In [8]:
# Training file without the 'justification' column (is_plus=False).
liar_train = load_df(file_path='../data/liar_plus/train2.tsv', is_plus=False)
liar_train.head(n=3)

Unnamed: 0,target,headline,speaker,context
0,4,Says the Annies List political group supports ...,dwayne-bohac,a mailer
1,3,When did the decline of coal start? It started...,scott-surovell,a floor speech.
2,1,"""Hillary Clinton agrees with John McCain """"by ...",barack-obama,Denver


In [9]:
# Same training file, this time keeping the 'justification' column (is_plus=True).
lp_train = load_df(file_path='../data/liar_plus/train2.tsv', is_plus=True)
lp_train.head(n=3)

Unnamed: 0,target,headline,speaker,context,justification
0,4,Says the Annies List political group supports ...,dwayne-bohac,a mailer,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,scott-surovell,a floor speech.,"""Surovell said the decline of coal """"started w..."
2,1,"""Hillary Clinton agrees with John McCain """"by ...",barack-obama,Denver,"""Obama said he would have voted against the am..."


In [61]:
# Read the PolitiFact CSV, drop four columns and rename 'when/where' to
# 'context' (the same name load_df gives its context column).
pf = (
    pd.read_csv('../data/politifact_plus.csv')
    .drop(columns=['documented_time', 'author_score', 'summaries', 'article'])
    .rename(columns={'when/where': 'context'})
)

# Encode the verdict labels, then discard the rows whose label encode_label
# did not recognise (those are encoded as -1).
pf['target'] = pf['target'].apply(encode_label)
pf = pf[pf['target'] != -1]

pf.head(n=3)

Unnamed: 0,source,context,headline,target,speaker,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,4,Madison Czopek,5.0,3.0,16.0,54.0,480.0,157.0
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,2,Laura Schulte,26.0,45.0,39.0,41.0,44.0,11.0
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",4,Ciara O'Rourke,5.0,3.0,16.0,54.0,480.0,157.0
