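Notebook for preprocessing fake-news-detection datasets: for each dataset it extracts readability, sentiment, LIWC and moral features, then saves the enriched data to CSV.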

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
from utils.text_processing_functions import *
from utils.dataset_loader import DatasetLoader
from moralstrength.moralstrength import estimate_morals
import nltk
import readability
from collections import Counter
import liwc
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Fetch the NLTK resources used by this notebook. When a package is already
# installed the call only reports that it is up to date (see the cell output).
nltk.download('wordnet') # WordNet corpus (English)
nltk.download('punkt') # NOTE(review): presumably needed by the tokenizer in utils -- confirm

[nltk_data] Downloading package wordnet to /home/sergio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sergio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# VADER analyzer whose `polarity_scores` is applied to each text further down.
analyzer = SentimentIntensityAnalyzer()

# LIWC lexicon: `parse` is called per token in the feature loop and its results
# are counted as categories; `category_names` lists every category.
parse, category_names = liwc.load_token_parser('dic/LIWCDictionary-en.dic')

# Zero-filled template with one entry per LIWC category.
liwc_dic = dict.fromkeys(category_names, 0)

In [5]:
# One loader per corpus, built from (dataset name, CSV file name).
# NOTE(review): construction presumably performs no I/O, since `load()` is
# called separately in the feature-extraction loop -- confirm in DatasetLoader.
fakenewsnet_dataset, fn_isot_dataset, fn_kaggle_dataset = (
    DatasetLoader('FakeNewsNet', 'fakenewsnet.csv'),
    DatasetLoader('FakeNewsISOT', 'fn_isot.csv'),
    DatasetLoader('FakeNewsKaggle', 'fn_kaggle.csv'),
)
(
    fakenews_amt_dataset,
    fn_random_political_dataset,
    fn_celebrity_dataset,
    fn_buzfeed_political_dataset,
) = (
    DatasetLoader('FakeNewsAMT', 'fakenews_amt.csv'),
    DatasetLoader('FakeNewsRandomPolitical', 'randompolitical.csv'),
    DatasetLoader('FakeNewsCelebrity', 'celebrity.csv'),
    DatasetLoader('FakeNewsBuzfeedPolitical', 'buzfeed_political.csv'),
)

In [6]:
# Corpora processed by this run. Alternative selections kept for reference:
#   [fn_kaggle_dataset, fn_isot_dataset]
#   [fakenewsnet_dataset, fn_isot_dataset, fn_kaggle_dataset]
datasets = [
    fakenews_amt_dataset,
    fn_random_political_dataset,
    fn_celebrity_dataset,
    fn_buzfeed_political_dataset,
]

In [7]:
def _liwc_category_counts(row):
    """Return the LIWC category values for one row of a dataset frame.

    Every token in ``row['tokenized_text']`` is passed to ``parse`` and the
    categories it yields are counted. The counter is then handed to
    ``update_counter`` together with ``row['readability_words']``
    (NOTE(review): presumably normalises the counts by the word count --
    confirm in utils.text_processing_functions) and merged into the
    zero-filled category template via ``dict_update``.

    A fresh copy of ``liwc_dic`` is passed for every row so that, should
    ``dict_update`` modify its first argument, values from one row can never
    leak into another row or into the shared template.
    """
    category_hits = Counter(
        category
        for token in row['tokenized_text']
        for category in parse(token)
    )
    adjusted_hits = update_counter(category_hits, row['readability_words'])
    return dict_update(dict(liwc_dic), adjusted_hits)


for dataset in datasets:
    print('-----Loading {dataset_name}-----'.format(dataset_name=dataset.name))
    dataset.load()
    dataset.lower()
    dataset.tokenize()

    df = dataset.df

    # Readability: one dict of measures per row, expanded into
    # `readability_*` columns (the LIWC step below reads `readability_words`).
    print('-----Extracting readability features-----')
    readability_features = df.apply(
        lambda row: readability.getmeasures(row['tokenized_text'], lang='en', merge=True),
        axis=1,
    )
    readability_features = readability_features.apply(pd.Series).add_prefix('readability_')
    df = dataset.concat_dataframe_columns(readability_features)

    # Sentiment: VADER polarity scores of the text, as `sentiment_*` columns.
    print('-----Extracting sentiment features-----')
    sentiments = df['text'].apply(analyzer.polarity_scores)
    sentiments = sentiments.apply(pd.Series).add_prefix('sentiment_')
    df = dataset.concat_dataframe_columns(sentiments)

    # LIWC: per-category values for every row, as `liwc_*` columns.
    print('-----Extracting liwc features-----')
    liwc_features = df.apply(_liwc_category_counts, axis=1)
    liwc_features = liwc_features.apply(pd.Series).add_prefix('liwc_')
    df = dataset.concat_dataframe_columns(liwc_features)

    # Moral features: missing estimates become 0 and columns get a `moral_`
    # prefix. Chained calls replace the former in-place fillna.
    print('-----Extracting moral features-----')
    morals = estimate_morals(df['text'], process=True).fillna(0).add_prefix('moral_')
    # NOTE(review): the return value is not used; the export cells read
    # `dataset.df`, so this presumably updates the loader's frame -- confirm.
    dataset.join_dataframe(morals)

    

    

-----Loading FakeNewsAMT-----
-----Extracting readability features-----
-----Extracting sentiment features-----
-----Extracting liwc features-----
-----Extracting moral features-----




-----Loading FakeNewsRandomPolitical-----
-----Extracting readability features-----
-----Extracting sentiment features-----
-----Extracting liwc features-----
-----Extracting moral features-----




-----Loading FakeNewsCelebrity-----
-----Extracting readability features-----
-----Extracting sentiment features-----
-----Extracting liwc features-----
-----Extracting moral features-----




-----Loading FakeNewsBuzfeedPolitical-----
-----Extracting readability features-----
-----Extracting sentiment features-----
-----Extracting liwc features-----
-----Extracting moral features-----




In [8]:
# Export by loader name rather than by position in `datasets`, so editing the
# selection list cannot silently save a different corpus under this filename.
fakenews_amt_dataset.df.to_csv('fakenewsamt_wf.csv')

In [9]:
# Export by loader name rather than by position in `datasets`, so editing the
# selection list cannot silently save a different corpus under this filename.
fn_random_political_dataset.df.to_csv('fn_randompolitical_wf.csv')

In [10]:
# Export by loader name rather than by position in `datasets`, so editing the
# selection list cannot silently save a different corpus under this filename.
fn_celebrity_dataset.df.to_csv('fn_celebrity_wf.csv')

In [11]:
# Export by loader name rather than by position in `datasets`, so editing the
# selection list cannot silently save a different corpus under this filename.
fn_buzfeed_political_dataset.df.to_csv('fn_buzfeed_wf.csv')