In [1]:
'''

    Script to process a dataset, calculating TextBlob polarity/subjectivity scores and VADER neg/neu/pos scores

    Assumes the dataset contains columns titled "text" and "title"

    Assumes input and output files are CSV

    Assumes that class indicator column is called 'label'
    TODO: If this needs to be a program that the prof/TA can use, then CLI args should be used for file names

'''


import os
import pandas as pd
from textblob import *
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn import tree
import openpyxl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import *




# Input/output locations (both CSV). `read_sheet` is only referenced by the
# Excel loader that is kept commented out below.
read_file = r"../data/Des_fake_news/WELFAKE.csv"
read_sheet = "valid"
write_file = r"../data/Des_fake_news/WELFAKE_PROCESSED.csv"

# Sampling configuration. A fixed seed makes "Restart Kernel -> Run All"
# select the same rows every time.
sample_size = 5000
random_seed = 42

#data = pd.read_excel(read_file, read_sheet)
data = pd.read_csv(read_file)

# Drop incomplete rows first so that the sample only contains complete rows.
data = data.dropna(axis=0)

# Cap n at the number of remaining rows: DataFrame.sample raises ValueError
# when n is larger than the population and replace=False (the default).
data = data.sample(n=min(sample_size, len(data)), random_state=random_seed)
data

Unnamed: 0.1,Unnamed: 0,title,text,label
35498,35498,Fox News Just Compared Slave Owners To 9/11 V...,Nobody can accuse Fox News of being sane most ...,1
30850,30850,Rescuers search for Philippine storm victims a...,MANILA (Reuters) - Rescuers in the Philippines...,0
65840,65840,Obama takes on role of America's pitchman at G...,"HANOVER, Germany (Reuters) - U.S. President Ba...",0
65424,65424,ISRAEL WILL NAME New Train Station Near Wester...,Israel s transportation minister is pushing ah...,1
21786,21786,Germany’s Angela Merkel Makes Incredibly Naive...,Perhaps Merkel should have considered the seri...,1
...,...,...,...,...
1247,1247,Terrorist Jon Ritzheimer Uses Sobbing Daughte...,"Jon Ritzheimer, infamous hater of Muslims and ...",1
49972,49972,Bernie Sanders' American Dream is in Denmark,"Copenhagen, Denmark (CNN) Open a newspaper on ...",0
1163,1163,Connecticut Becomes First State To Boycott Ind...,WASHINGTON -- Connecticut Gov. Dan Malloy (D) ...,0
15545,15545,Betting on a Trump win or a North Korea H-bomb...,NEW YORK/WELLINGTON (Reuters) - Erik Duhaime i...,0


In [2]:
'''
    Natural language preprocessing

    Remove punctuation, make all words lowercase, and lemmatize
'''

from nltk.corpus import wordnet
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import string

def nltk_tag_to_wordnet(tag: str) -> str:
    """Translate an NLTK part-of-speech tag into the matching WordNet tag.

    NLTK's tagger returns many distinct tags, while WordNet lemmatization
    only distinguishes four (adjective, verb, noun, adverb). The mapping is
    decided by the first character of the NLTK tag.

    Args:
        tag: Part-of-speech tag as produced by NLTK's tagger.

    Returns:
        wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV, or the empty
        string when the tag (including an empty tag) has no WordNet equivalent.
    """
    # Slice instead of indexing: tag[0] would raise IndexError on an empty tag.
    initial = tag[:1]
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "N":
        return wordnet.NOUN
    if initial == "R":
        return wordnet.ADV
    return ""
    
def keep_only_alphabetic(s: str) -> str:
    """Strip everything except ASCII letters and whitespace from a string.

    Characters listed in string.punctuation are replaced by a single space.
    Any other character that is neither an ASCII letter (A-Z, a-z) nor
    whitespace is removed without a replacement.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = []
    for ch in s:
        if ch in string.punctuation:
            # Punctuation becomes a space so neighbouring words stay separated.
            cleaned.append(" ")
        elif "A" <= ch <= "Z" or "a" <= ch <= "z" or ch.isspace():
            cleaned.append(ch)
        # Anything else (digits, non-ASCII symbols, ...) is dropped outright.
    return "".join(cleaned)

def lemmatize_words(s: str) -> list[str]:
    """Tokenize a string and return the list of its lemmatized tokens.

    Steps, in order: tokenize and lowercase, drop English stopwords,
    part-of-speech tag the remaining tokens, discard tokens whose tag has no
    WordNet equivalent, and lemmatize the rest with their WordNet tag.

    Args:
        s: Text to tokenize and lemmatize.

    Returns:
        Lemmas in their original order of appearance.
    """
    lemmer = nltk.stem.WordNetLemmatizer()

    # Build the stopword set once per call instead of re-evaluating
    # stopwords.words("english") for every token; a set also gives
    # constant-time membership checks.
    english_stopwords = set(stopwords.words("english"))

    tokens = [token.lower() for token in word_tokenize(s)]
    tokens = [token for token in tokens if token not in english_stopwords]

    lemmas = []
    for word, nltk_tag in pos_tag(tokens):
        # Convert the tag once and reuse it for both the filter and the lemmatizer.
        wordnet_tag = nltk_tag_to_wordnet(nltk_tag)
        if wordnet_tag != "":
            lemmas.append(lemmer.lemmatize(word, wordnet_tag))
    return lemmas

def pre_process_text(text: str) -> str:
    """Pre-process a string by chaining the helpers defined above.

    The text is first reduced to letters and whitespace by
    keep_only_alphabetic, then tokenized and lemmatized by lemmatize_words.

    Args:
        text: Raw text.

    Returns:
        The lemmas joined by single spaces.
    """
    return " ".join(lemmatize_words(keep_only_alphabetic(text)))






In [3]:
# Clean both free-text columns element by element (letters and whitespace only).
data["text"] = data["text"].map(keep_only_alphabetic)
data["title"] = data["title"].map(keep_only_alphabetic)
data

Unnamed: 0.1,Unnamed: 0,title,text,label
35498,35498,Fox News Just Compared Slave Owners To Vict...,Nobody can accuse Fox News of being sane most ...,1
30850,30850,Rescuers search for Philippine storm victims a...,MANILA Reuters Rescuers in the Philippines...,0
65840,65840,Obama takes on role of America s pitchman at G...,HANOVER Germany Reuters U S President Ba...,0
65424,65424,ISRAEL WILL NAME New Train Station Near Wester...,Israel s transportation minister is pushing ah...,1
21786,21786,Germanys Angela Merkel Makes Incredibly Naive ...,Perhaps Merkel should have considered the seri...,1
...,...,...,...,...
1247,1247,Terrorist Jon Ritzheimer Uses Sobbing Daughte...,Jon Ritzheimer infamous hater of Muslims and ...,1
49972,49972,Bernie Sanders American Dream is in Denmark,Copenhagen Denmark CNN Open a newspaper on ...,0
1163,1163,Connecticut Becomes First State To Boycott Ind...,WASHINGTON Connecticut Gov Dan Malloy D ...,0
15545,15545,Betting on a Trump win or a North Korea H bomb...,NEW YORK WELLINGTON Reuters Erik Duhaime i...,0


In [4]:
# SKIP FOR WELFAKE

'''
def to_class(s):
    if s == "real":
        return 1
    return 0
'''


def liar_class_process(s: str) -> int:
    """Convert a textual truthfulness label into an integer class.

    The comparison is case-insensitive. Labels that are not listed in the
    table below map to 0.

    Args:
        s: Label string.

    Returns:
        Integer class between 0 and 4.
    """
    # NOTE(review): "part-true" and "barely-true" share class 2, and the public
    # LIAR dataset names its middle label "half-true" - confirm this table
    # matches the labels actually present in the input file.
    label_to_class = {
        "true": 4,
        "mostly-true": 3,
        "part-true": 2,
        "barely-true": 2,
        "pants-fire": 1,
        "false": 0,
    }
    return label_to_class.get(s.lower(), 0)


In [5]:
# Only inputs whose file name contains "liar" get their textual "class"
# labels converted to integers; for any other input this cell leaves the
# frame unchanged.
if "liar" in read_file.lower():
    data["class"] = data["class"].map(liar_class_process)

data

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE OBAMAS ATTORNEY GENERAL SAYS MOS...,Now most of the demonstrators gathered last ...,1
3,3,Bobby Jindal raised Hindu uses story of Chri...,A dozen politically active pastors came here f...,0
4,4,SATAN Russia unvelis an image of its terrify...,The RS Sarmat missile dubbed Satan will re...,1
5,5,About Time Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U ...,WASHINGTON Reuters Hackers believed to be ...,0
72130,72130,WATCH Giuliani Demands That Democrats Apolog...,You know because in fantasyland Republicans n...,1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY Reuters Donald Trumps combativ...,0


In [5]:
# TextBlob polarity and subjectivity for the article body and the title.
# The sentiment of each string is computed once and both values are read from
# that single result, instead of building and analysing a TextBlob twice.
for column in ("text", "title"):
    sentiment = data[column].map(lambda value: TextBlob(value).sentiment)
    data[f"{column}_tb_pol"] = sentiment.map(lambda scores: scores.polarity)
    data[f"{column}_tb_sub"] = sentiment.map(lambda scores: scores.subjectivity)
data

Unnamed: 0.1,Unnamed: 0,title,text,label,text_tb_pol,text_tb_sub,title_tb_pol,title_tb_sub
35498,35498,Fox News Just Compared Slave Owners To Vict...,Nobody can accuse Fox News of being sane most ...,1,-0.073710,0.456448,-0.333333,0.666667
30850,30850,Rescuers search for Philippine storm victims a...,MANILA Reuters Rescuers in the Philippines...,0,0.031739,0.337790,0.000000,0.000000
65840,65840,Obama takes on role of America s pitchman at G...,HANOVER Germany Reuters U S President Ba...,0,0.290539,0.514975,0.000000,0.000000
65424,65424,ISRAEL WILL NAME New Train Station Near Wester...,Israel s transportation minister is pushing ah...,1,0.081429,0.257857,0.078788,0.284848
21786,21786,Germanys Angela Merkel Makes Incredibly Naive ...,Perhaps Merkel should have considered the seri...,1,0.105782,0.422132,-0.300000,1.000000
...,...,...,...,...,...,...,...,...
1247,1247,Terrorist Jon Ritzheimer Uses Sobbing Daughte...,Jon Ritzheimer infamous hater of Muslims and ...,1,0.053139,0.473593,0.000000,0.000000
49972,49972,Bernie Sanders American Dream is in Denmark,Copenhagen Denmark CNN Open a newspaper on ...,0,0.084418,0.440525,0.000000,0.000000
1163,1163,Connecticut Becomes First State To Boycott Ind...,WASHINGTON Connecticut Gov Dan Malloy D ...,0,0.071160,0.467252,0.250000,0.333333
15545,15545,Betting on a Trump win or a North Korea H bomb...,NEW YORK WELLINGTON Reuters Erik Duhaime i...,0,0.064652,0.339281,0.800000,0.400000


In [7]:
analyzer = SentimentIntensityAnalyzer()

# VADER negative / neutral / positive proportions for the title, then the body.
# One loop replaces the two copy-pasted sections; the score dicts live in a
# local Series, so no temporary column has to be added and dropped again.
for column in ("title", "text"):
    scores = data[column].map(analyzer.polarity_scores)
    for key in ("neg", "neu", "pos"):
        # key=key binds the current key to the lambda at definition time.
        data[f"{column}_vader_{key}"] = scores.map(
            lambda row_scores, key=key: row_scores[key]
        )

In [8]:
# Persist the scored frame as CSV. With the default settings the DataFrame
# index is written as well, as the first (unnamed) column.
data.to_csv(path_or_buf=write_file)