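Notebook for extracting linguistic features (readability, sentiment, emotion, LIWC, moral and subjectivity features) from the processed datasets used for fake news detection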

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
from collections import Counter

import liwc
import nltk
import numpy as np
import pandas as pd
import readability
from moralstrength.moralstrength import estimate_morals
from nrclex import NRCLex
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from utils.dataset_loader import DatasetLoader

In [2]:
def dict_update(x, z):
    """Return a copy of ``x`` updated with the entries of ``z``.

    ``x`` is copied with its own ``copy()`` method before ``z`` is applied
    through ``update()``, so neither argument is modified. On key collisions
    the value coming from ``z`` wins.
    """
    merged = x.copy()
    merged.update(z)
    return merged

def update_counter(coun, length):
    """Divide every value stored in ``coun`` by ``length``, in place.

    The mapping passed in is modified and the same object is returned.
    Dividing by a ``length`` of 0 raises ``ZeroDivisionError`` as soon as
    ``coun`` holds at least one entry.
    """
    # Snapshot the keys first; only the values of existing keys are rewritten.
    for key in tuple(coun):
        coun[key] /= length
    return coun

In [3]:
# Download the NLTK resources needed at runtime. The captured output shows that
# packages which are already present are reported as up-to-date.
# NOTE(review): presumably required by the text-processing libraries used
# below — confirm which ones actually need them.
nltk.download('wordnet')  # WordNet lexical database (English)
nltk.download('punkt')  # Punkt sentence tokenizer models

[nltk_data] Downloading package wordnet to /home/sergio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sergio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the English LIWC dictionary. `parse` is called on every token in the
# processing loop and iterated for the categories it yields; `category_names`
# lists the categories of the dictionary.
parse, category_names = liwc.load_token_parser('dic/LIWCDictionary-en.dic')

# Template with one zero entry per LIWC category, used so that every row ends
# up with the same set of LIWC columns.
liwc_dic = dict.fromkeys(category_names, 0)

# VADER analyzer whose `polarity_scores` is applied to each text.
analyzer = SentimentIntensityAnalyzer()


In [5]:
# Load datasets
# One DatasetLoader per corpus, each built from a dataset name and the path of
# its CSV under ./processed. `.load()` is called on every loader later, in the
# processing loop; the name is also used there to build the output file name.
fakenewsnet_dataset = DatasetLoader('FakeNewsNet', './processed/fn_fakenewsnet.csv')
fn_isot_dataset = DatasetLoader('FakeNewsISOT', './processed/fn_isot.csv')
fn_kaggle_dataset = DatasetLoader('FakeNewsKaggle', './processed/fn_kaggle.csv')
fakenews_amt_dataset = DatasetLoader('FakeNewsAMT', './processed/fn_amt.csv')
fn_random_political_dataset = DatasetLoader('FakeNewsRandomPolitical', './processed/fn_randompolitical.csv')
fn_celebrity_dataset = DatasetLoader('FakeNewsCelebrity', './processed/fn_celebrity.csv')
fn_buzfeed_political_dataset = DatasetLoader('FakeNewsBuzfeedPolitical', './processed/fn_buzfeed_political.csv')
fn_politfalse = DatasetLoader('FakeNewsPolitFalse', './processed/fn_politfalse.csv')
fn_satirical = DatasetLoader('FakeNewsSatirical', './processed/fn_satirical.csv')


In [6]:
# Datasets handled by the feature-extraction loop, processed in this order.
# Trim the list to work on a subset while experimenting.
datasets = [
    fakenewsnet_dataset,
    fn_isot_dataset,
    fn_kaggle_dataset,
    fakenews_amt_dataset,
    fn_random_political_dataset,
    fn_celebrity_dataset,
    fn_buzfeed_political_dataset,
    fn_politfalse,
    fn_satirical,
]


In [7]:
# Create the output directory up front: the save step runs only after all the
# feature extraction for a dataset, so a missing folder would otherwise throw
# that work away.
os.makedirs('./wf', exist_ok=True)


def liwc_category_frequencies(row):
    """Return the LIWC category frequencies for one dataset row.

    Counts the LIWC categories produced by ``parse`` for every element of
    ``row['tokenized_text']``, divides each count by
    ``row['readability_words']`` and adds a 0 entry (from ``liwc_dic``) for
    every category that was not matched.

    Raises ``ZeroDivisionError`` when ``row['readability_words']`` is 0 while
    at least one category was matched.
    """
    # NOTE(review): assumes iterating `tokenized_text` yields word tokens;
    # if it is a plain string this iterates characters — confirm against
    # DatasetLoader.tokenize().
    category_counts = Counter(
        category
        for token in row['tokenized_text']
        for category in parse(token)
    )
    return dict_update(liwc_dic, update_counter(category_counts, row['readability_words']))


for dataset in datasets:
    print('-----Loading {dataset_name}-----'.format(dataset_name=dataset.name))
    dataset.load()
    dataset.lower()
    dataset.tokenize()

    df = dataset.df

    # extract readability features (one column per measure, prefixed with
    # 'readability_'); 'readability_words' is reused by the LIWC step below
    print('-----Extracting readability features-----')
    readability_features = df.apply(
        lambda x: readability.getmeasures(x['tokenized_text'], lang='en', merge=True),
        axis=1,
    )
    readability_features = readability_features.apply(pd.Series).add_prefix('readability_')
    df = dataset.concat_dataframe_columns(readability_features)

    # extract sentiment features (VADER polarity scores, prefixed with 'sentiment_')
    print('-----Extracting sentiment features-----')
    sentiments = df['text'].apply(analyzer.polarity_scores)
    sentiments = sentiments.apply(pd.Series).add_prefix('sentiment_')
    df = dataset.concat_dataframe_columns(sentiments)

    # extract emotions features (NRCLex affect frequencies, prefixed with 'emotion_')
    print('-----Extracting emotions features-----')
    emotions_features = df.apply(lambda x: NRCLex(x.text).affect_frequencies, axis=1)
    emotions_features = emotions_features.apply(pd.Series).add_prefix('emotion_')
    df = dataset.concat_dataframe_columns(emotions_features)

    # extract liwc features (per-category frequencies, prefixed with 'liwc_')
    print('-----Extracting liwc features-----')
    liwc_features = df.apply(liwc_category_frequencies, axis=1)
    liwc_features = liwc_features.apply(pd.Series).add_prefix('liwc_')
    df = dataset.concat_dataframe_columns(liwc_features)

    # extract moral features; missing estimates are stored as 0
    print('-----Extracting moral features-----')
    morals = estimate_morals(df.text, process=True).fillna(0).add_prefix('moral_')
    dataset.join_dataframe(morals)

    # extract subjectivity
    # NOTE(review): `df` is the frame returned before join_dataframe() ran;
    # presumably it still shares its index with dataset.df so the assignment
    # aligns row by row — confirm against DatasetLoader.join_dataframe().
    print('-----Extracting subjectivity-----')
    dataset.df['subjectivity'] = df['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

    # save dataset
    print('-----Saving {dataset_name}-----'.format(dataset_name=dataset.name))
    dataset.df.to_csv('./wf/{dataset_name}_wf.csv'.format(dataset_name=dataset.name))

-----Loading FakeNewsMcintire-----
-----Extracting readability features-----
-----Extracting sentiment features-----
-----Extracting emotions features-----
-----Extracting liwc features-----
-----Extracting moral features-----




-----Extracting subjectivity-----
-----Saving FakeNewsMcintire-----
