In [21]:
'''

    Script to process dataset, calculating TextBlob polarity and subjectivity scores and VADER neg/neu/pos scores

    Assumes the dataset contains columns titled "text" and "title"

    Assumes input and output files are CSV

    Assumes that class indicator column is called 'label'
    TODO: If this needs to be a program that the prof/TA can use, then CLI args should be used for file names

'''


import os
import pandas as pd
from textblob import *
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn import tree
import openpyxl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import *




# Input and output locations, relative to the notebook's working directory.
read_file = r"../data/Des_fake_news/ISOT.csv"
read_sheet = "valid"  # only referenced by the commented-out Excel reader below
write_file = r"../data/Des_fake_news/ISOT_PROCESSED.csv"

#data = pd.read_excel(read_file, read_sheet)

# Load the CSV and discard every row that has at least one missing value.
data = pd.read_csv(read_file).dropna(axis=0)
data

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31 0:00,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29 0:00,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31 0:00,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30 0:00,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29 0:00,1
...,...,...,...,...,...
44872,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,42385,0
44873,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,42385,0
44874,Sunnistan: US and Allied Safe Zone Plan to Tak...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,42384,0
44875,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,42383,0


In [22]:
'''
    Natural language preprocessing

    Remove punctuation, make all words lowercase, and lemmatize
'''

from nltk.corpus import wordnet
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import string

'''
    NLTK has a model to tag words as adjectives, nouns, etc,
    but NLTK uses wordnet for lemmatization. wordnet only uses
    four possible tags, while NLTK returns tons of unique ones

    This function transforms NLTK tags to wordnet tags for lemmatization
'''
def nltk_tag_to_wordnet(tag: str) -> str:
    """Translate an NLTK part-of-speech tag into the matching WordNet tag.

    Only the first character of ``tag`` is inspected: "J" maps to
    ``wordnet.ADJ``, "V" to ``wordnet.VERB``, "N" to ``wordnet.NOUN`` and
    "R" to ``wordnet.ADV``.

    Args:
        tag: Part-of-speech tag as produced by ``pos_tag``.

    Returns:
        The WordNet tag constant, or "" when the tag has no WordNet
        counterpart (including an empty ``tag``).
    """
    prefix_to_wordnet = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    # tag[:1] rather than tag[0]: an empty tag yields "" instead of raising IndexError.
    return prefix_to_wordnet.get(tag[:1], "")
    
'''
    Remove non-alphabetical characters and punctuation
'''
def keep_only_alphabetic(s: str) -> str:
    """Strip everything except ASCII letters and whitespace from ``s``.

    Each character in ``string.punctuation`` is replaced by a single space
    (so words joined by punctuation stay separated); ASCII letters and
    whitespace characters are kept as-is; every other character (digits,
    non-ASCII letters, symbols) is dropped.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text.
    """
    kept = []
    for ch in s:
        if ch in string.punctuation:
            kept.append(" ")
        elif ch in string.ascii_letters or ch.isspace():
            kept.append(ch)
    return "".join(kept)

'''
    Take a string of text, tokenize it, and return a list of lemmatized tokens
'''
def lemmatize_words(s: str) -> list[str]:
    """Tokenize ``s`` and return its lemmatized, stopword-free tokens.

    Steps: tokenize and lowercase, drop English stopwords, part-of-speech
    tag the remaining tokens, discard tokens whose tag has no WordNet
    equivalent, then lemmatize each survivor with its WordNet tag.

    Args:
        s: Text to process.

    Returns:
        The lemmas, in the order their tokens appear in ``s``.
    """
    lemmer = nltk.stem.WordNetLemmatizer()
    # Fetch the stopword list once and keep it in a set; previously
    # stopwords.words("english") was called again for every token.
    stop_words = set(stopwords.words("english"))

    tokens = [tok.lower() for tok in word_tokenize(s)]
    tokens = [tok for tok in tokens if tok not in stop_words]

    lemmas = []
    for word, tag in pos_tag(tokens):
        # Map the tag once and reuse it for both the filter and the lemmatizer.
        wn_tag = nltk_tag_to_wordnet(tag)
        if wn_tag != "":
            lemmas.append(lemmer.lemmatize(word, wn_tag))
    return lemmas

'''
    Combine all functions above to pre-process a string
'''
def pre_process_text(text: str) -> str:
    """Clean ``text`` and return its lemmas joined by single spaces.

    Runs ``keep_only_alphabetic`` followed by ``lemmatize_words`` and joins
    the resulting tokens with " ".
    """
    return " ".join(lemmatize_words(keep_only_alphabetic(text)))






In [23]:
# Strip punctuation, digits and other non-letter characters from both text
# columns. Series.map applies the cleaner per value, avoiding the cost of
# building a row object for every record as DataFrame.apply(axis=1) does.
for column in ("text", "title"):
    data[column] = data[column].map(keep_only_alphabetic)
data

Unnamed: 0,title,text,subject,date,label
0,As U S budget fight looms Republicans flip t...,WASHINGTON Reuters The head of a conservat...,politicsNews,2017-12-31 0:00,1
1,U S military to accept transgender recruits o...,WASHINGTON Reuters Transgender people will...,politicsNews,2017-12-29 0:00,1
2,Senior U S Republican senator Let Mr Muell...,WASHINGTON Reuters The special counsel inv...,politicsNews,2017-12-31 0:00,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON Reuters Trump campaign adviser ...,politicsNews,2017-12-30 0:00,1
4,Trump wants Postal Service to charge much mor...,SEATTLE WASHINGTON Reuters President Donal...,politicsNews,2017-12-29 0:00,1
...,...,...,...,...,...
44872,McPain John McCain Furious That Iran Treated ...,st Century Wire says As WIRE reported earlier ...,Middle-east,42385,0
44873,JUSTICE Yahoo Settles E mail Privacy Class ac...,st Century Wire says It s a familiar theme Wh...,Middle-east,42385,0
44874,Sunnistan US and Allied Safe Zone Plan to Tak...,Patrick Henningsen st Century WireRemember wh...,Middle-east,42384,0
44875,How to Blow Million Al Jazeera America Fina...,st Century Wire says Al Jazeera America will g...,Middle-east,42383,0


In [24]:
# SKIP FOR WELFAKE

'''
def to_class(s):
    if s == "real":
        return 1
    return 0
'''


def liar_class_process(s: str) -> int:
    """Collapse a textual truthfulness label into a binary class.

    The comparison is case-insensitive.

    Args:
        s: Label string.

    Returns:
        1 when the label is "true" or "mostly-true", otherwise 0.
    """
    truthful_labels = ("true", "mostly-true")
    return 1 if s.lower() in truthful_labels else 0


In [25]:
#data["flag"] = data.apply(lambda x: to_class(x["flag"]), axis=1)
#data

# When the input file name contains "liar", convert the string labels in the
# "class" column to 0/1. Series.map is used instead of a row-wise
# DataFrame.apply because only one column is involved.
# NOTE(review): the notebook header says the class column is called 'label',
# but this branch reads and writes "class" -- confirm against the LIAR file.
if "liar" in read_file.lower():
    data["class"] = data["class"].map(liar_class_process)

data

Unnamed: 0,title,text,subject,date,label
0,As U S budget fight looms Republicans flip t...,WASHINGTON Reuters The head of a conservat...,politicsNews,2017-12-31 0:00,1
1,U S military to accept transgender recruits o...,WASHINGTON Reuters Transgender people will...,politicsNews,2017-12-29 0:00,1
2,Senior U S Republican senator Let Mr Muell...,WASHINGTON Reuters The special counsel inv...,politicsNews,2017-12-31 0:00,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON Reuters Trump campaign adviser ...,politicsNews,2017-12-30 0:00,1
4,Trump wants Postal Service to charge much mor...,SEATTLE WASHINGTON Reuters President Donal...,politicsNews,2017-12-29 0:00,1
...,...,...,...,...,...
44872,McPain John McCain Furious That Iran Treated ...,st Century Wire says As WIRE reported earlier ...,Middle-east,42385,0
44873,JUSTICE Yahoo Settles E mail Privacy Class ac...,st Century Wire says It s a familiar theme Wh...,Middle-east,42385,0
44874,Sunnistan US and Allied Safe Zone Plan to Tak...,Patrick Henningsen st Century WireRemember wh...,Middle-east,42384,0
44875,How to Blow Million Al Jazeera America Fina...,st Century Wire says Al Jazeera America will g...,Middle-east,42383,0


In [26]:
# TextBlob polarity and subjectivity for the article body and the headline.
# Each document is analysed once via .sentiment and both scores are read from
# that result; previously a TextBlob was built and analysed separately for
# polarity and again for subjectivity, doubling the work.
for column in ("text", "title"):
    sentiments = [TextBlob(document).sentiment for document in data[column]]
    data[f"{column}_tb_pol"] = [sentiment.polarity for sentiment in sentiments]
    data[f"{column}_tb_sub"] = [sentiment.subjectivity for sentiment in sentiments]
data

Unnamed: 0,title,text,subject,date,label,text_tb_pol,text_tb_sub,title_tb_pol,title_tb_sub
0,As U S budget fight looms Republicans flip t...,WASHINGTON Reuters The head of a conservat...,politicsNews,2017-12-31 0:00,1,0.037083,0.410250,0.00,0.0
1,U S military to accept transgender recruits o...,WASHINGTON Reuters Transgender people will...,politicsNews,2017-12-29 0:00,1,0.044354,0.308401,-0.10,0.1
2,Senior U S Republican senator Let Mr Muell...,WASHINGTON Reuters The special counsel inv...,politicsNews,2017-12-31 0:00,1,0.115930,0.316798,0.00,0.0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON Reuters Trump campaign adviser ...,politicsNews,2017-12-30 0:00,1,0.035968,0.306569,0.00,0.0
4,Trump wants Postal Service to charge much mor...,SEATTLE WASHINGTON Reuters President Donal...,politicsNews,2017-12-29 0:00,1,0.031316,0.398166,0.35,0.3
...,...,...,...,...,...,...,...,...,...
44872,McPain John McCain Furious That Iran Treated ...,st Century Wire says As WIRE reported earlier ...,Middle-east,42385,0,0.085978,0.524124,0.00,0.0
44873,JUSTICE Yahoo Settles E mail Privacy Class ac...,st Century Wire says It s a familiar theme Wh...,Middle-east,42385,0,0.110000,0.610000,0.10,0.1
44874,Sunnistan US and Allied Safe Zone Plan to Tak...,Patrick Henningsen st Century WireRemember wh...,Middle-east,42384,0,0.073528,0.415687,0.50,0.5
44875,How to Blow Million Al Jazeera America Fina...,st Century Wire says Al Jazeera America will g...,Middle-east,42383,0,0.088566,0.426744,0.00,1.0


In [27]:
analyzer = SentimentIntensityAnalyzer()

# VADER neg/neu/pos scores for the headline and the article body.
# Each document is scored once; the three values are then written to
# "<column>_vader_neg", "<column>_vader_neu" and "<column>_vader_pos".
# The score dicts are held in a local list, so no temporary
# "*_vader_scores" column has to be added and dropped again.
for column in ("title", "text"):
    vader_scores = [analyzer.polarity_scores(document) for document in data[column]]
    for key in ("neg", "neu", "pos"):
        data[f"{column}_vader_{key}"] = [scores[key] for scores in vader_scores]

In [29]:
# Write the processed frame (original columns plus the sentiment features) to write_file.
# NOTE(review): the frame's index is presumably written as an extra leading
# column (pandas default), on top of the "Unnamed: 0" column already present
# in the loaded data -- consider index=False if no downstream reader relies on it.
data.to_csv(write_file)