In [None]:
import spacy
import pandas as pd
import re
import string

from wordcloud import STOPWORDS

In [None]:
# ISOT Fake news dataset: rows from Fake.csv get label 0, rows from True.csv get label 1
dataset_fake = pd.read_csv("../data/ISOT/Fake.csv.zip")
dataset_fake['label'] = 0
dataset_true = pd.read_csv("../data/ISOT/True.csv.zip")
dataset_true['label'] = 1
# ignore_index=True: each file is read with its own 0-based index, so a plain
# concat would leave every label from the shorter file duplicated in the result.
ISOT_dataset = pd.concat([dataset_fake, dataset_true], ignore_index=True)

# Fake news dataset (rows with any missing value are discarded)
FN_dataset = pd.read_csv("../data/FakeNews/train.csv.zip").dropna()

# There are missing apostrophes in contractions (e.g. "couldn't" -> "couldn t")

In [None]:
class DataPreprocessing():
    """Turn raw text into a list of lemmas with stop words and punctuation removed.

    The spaCy pipeline is loaded once, when the class body is executed, and is
    shared by every instance. The model has to be installed beforehand:

        python -m spacy download en_core_web_sm
    """

    _nlp = spacy.load('en_core_web_sm')
    # Characters that must never be returned as stand-alone tokens.
    _punctuations = "?:!.,;"
    # Extra stop words (mostly markup / web noise) merged with wordcloud's STOPWORDS.
    _stop_words = ["https", "ul", "-", " ", ".", "org", "wiki", "wikipedia", "will", "image", "s", "t", "m", "ve", "u", "said",
                   "code", "span", "container", "imgur", "html", "jpg", "png", "jpeg", "nofollow", "quot", "comment",
                   "site", "SE", "tags", "comments", "moderators", "answers", "posts", "stackoverflow", "amp", "strike",
                   "chat", "edit", "alt", "link", "exchange", "help", "votes", "noreferrer", "closed", "png", "h1", "new",
                   "stackexchange", "class", "users", "questions", "topic", "meta", "stack", "href", "tag", "p", "em", "rel",
                   "li", "user", "answer", "title", "h3", "moderator", "post", "vote", "blockquote", "strong", "tagged",
                   "question", "img", "page", '"', "\n"] + list(STOPWORDS)

    def process(self, data):
        """Normalise, lemmatise and filter one document.

        Steps: lowercase, strip ASCII punctuation, collapse whitespace, run the
        spaCy pipeline, then keep each token's lemma unless it is empty, a
        punctuation mark, a spaCy stop word or listed in ``_stop_words``.

        Args:
            data (str): Raw text of a single document.

        Returns:
            list[str]: The surviving lemmas, in document order.
        """
        text = data.lower()
        # '"' is already part of string.punctuation, so no extra characters are needed.
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Collapse whitespace only AFTER stripping punctuation: deleting a
        # character such as "-" in "a - b" creates a new run of spaces, and
        # spaCy turns extra whitespace (including newlines) into tokens that
        # would otherwise leak into the result.
        text = re.sub(r'\s+', ' ', text).strip()
        doc = self._nlp(text)

        # Sets give O(1) membership tests and, for the punctuation string,
        # per-character matching instead of substring matching.
        stop_words = set(self._stop_words)
        punctuation = set(self._punctuations)
        vocab = self._nlp.vocab

        # Build the result in a single pass rather than removing items from a
        # list while iterating over it, which skips the element that follows
        # every removal.
        filtered_sentence = []
        for token in doc:
            lemma = token.lemma_
            if not lemma or lemma in punctuation:
                continue
            # The stop-word flag is looked up on the lemma, not on the surface form.
            if vocab[lemma].is_stop or lemma in stop_words:
                continue
            filtered_sentence.append(lemma)

        return filtered_sentence

In [None]:
# Preprocessor instance whose `process` method is applied to the dataset text columns.
dataPreprocessing = DataPreprocessing()

In [None]:
# ISOT preprocessing: take the first 99 rows, keep only text and label,
# and replace the raw text with its processed token list.
ISOT_res = (
    ISOT_dataset.iloc[:99][['text', 'label']]
    .assign(text=lambda frame: frame['text'].apply(dataPreprocessing.process))
)

# Saving to file
ISOT_res.to_csv('../data/ISOT_Preprocessed/data.csv')

In [None]:
# Fake News preprocessing: take the first 99 rows, keep only text and label,
# and replace the raw text with its processed token list.
FN_res = (
    FN_dataset.iloc[:99][['text', 'label']]
    .assign(text=lambda frame: frame['text'].apply(dataPreprocessing.process))
)

# Saving to file
FN_res.to_csv('../data/FakeNews_Preprocessed/data.csv')