# Imports

In [34]:
# imports
import os
import pandas as pd
import numpy as np

import sys

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()

import gensim

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

#wordnet lemmatization
nltk.download('wordnet')
#more for preprocessing
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = PorterStemmer()

import pyLDAvis.gensim

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

import pickle


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

import glob
import json

import warnings; warnings.simplefilter('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<br><br>

# Topic modelling

In [15]:
%%time

num_topics = 15

dictionary = gensim.corpora.Dictionary.load(os.path.join("gensim", "dictionary"))

bigram = gensim.models.phrases.Phrases.load(os.path.join('gensim', 'bigram.pkl'))
trigram = gensim.models.phrases.Phrases.load(os.path.join('gensim', 'trigram.pkl'))

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

tfidf_model = gensim.models.TfidfModel.load(os.path.join('gensim', 'tfidf_model.pkl'))

lda_tfidf_model = gensim.models.LdaMulticore.load(os.path.join("gensim", "02", "lda_model"))

Wall time: 1min 15s


In [16]:
def stem_lemmatize(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def make_ngrams(text):
    return trigram_mod[bigram_mod[text]]

def topic_preprocess(text):
    #all the nice preprocessing without the bigrams and trigrams
    output = []
    
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            output.append(stem_lemmatize(token))
            
    output = make_ngrams(output)
    
    output = dictionary.doc2bow(output)
    
    return tfidf_model[output]

text = 'WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trump’s 2016 election campaign should continue without interference in 2018, despite calls from some Trump administration allies and Republican lawmakers to shut it down, a prominent Republican senator said on Sunday. Lindsey Graham, who serves on the Senate armed forces and judiciary committees, said Department of Justice Special Counsel Robert Mueller needs to carry on with his Russia investigation without political interference. “This investigation will go forward. It will be an investigation conducted without political influence,” Graham said on CBS’s Face the Nation news program. “And we all need to let Mr. Mueller do his job. I think he’s the right guy at the right time.”  The question of how Russia may have interfered in the election, and how Trump’s campaign may have had links with or co-ordinated any such effort, has loomed over the White House since Trump took office in January. It shows no sign of receding as Trump prepares for his second year in power, despite intensified rhetoric from some Trump allies in recent weeks accusing Mueller’s team of bias against the Republican president. Trump himself seemed to undercut his supporters in an interview last week with the New York Times in which he said he expected Mueller was “going to be fair.”    Russia’s role in the election and the question of possible links to the Trump campaign are the focus of multiple inquiries in Washington. Three committees of the Senate and the House of Representatives are investigating, as well as Mueller, whose team in May took over an earlier probe launched by the U.S. Federal Bureau of Investigation (FBI). Several members of the Trump campaign and administration have been convicted or indicted in the investigation.  Trump and his allies deny any collusion with Russia during the campaign, and the Kremlin has denied meddling in the election. Graham said he still wants an examination of the FBI’s use of a dossier on links between Trump and Russia that was compiled by a former British spy, Christopher Steele, which prompted Trump allies and some Republicans to question Mueller’s inquiry.   On Saturday, the New York Times reported that it was not that dossier that triggered an early FBI probe, but a tip from former Trump campaign foreign policy adviser George Papadopoulos to an Australian diplomat that Russia had damaging information about former Trump rival Hillary Clinton.  “I want somebody to look at the way the Department of Justice used this dossier. It bothers me greatly the way they used it, and I want somebody to look at it,” Graham said. But he said the Russia investigation must continue. “As a matter of fact, it would hurt us if we ignored it,” he said. ' 

print(topic_preprocess(text))

[(0, 0.0630640747973534), (18, 0.020609302817723104), (19, 0.15387661217226292), (50, 0.04488019472778492), (52, 0.044539876958436406), (60, 0.03410322337525922), (61, 0.03209961257312386), (76, 0.019759801396857915), (90, 0.05563363852409356), (102, 0.04573375458652516), (106, 0.04180469038753974), (110, 0.20559324917334365), (124, 0.023293427382694194), (125, 0.04998384880175368), (138, 0.026169239538019667), (140, 0.043065346061500934), (150, 0.03097346697214275), (155, 0.08641863923306929), (169, 0.09605100049965742), (173, 0.032832981567750864), (180, 0.0423381198999444), (181, 0.020641173417236), (186, 0.023891827406085326), (189, 0.20196668286594258), (195, 0.0663901160677921), (196, 0.03445425715814743), (197, 0.03384255408349029), (201, 0.030139608848495487), (205, 0.014156029831098978), (216, 0.08138936610877293), (225, 0.0573063845155341), (278, 0.029238562700818666), (281, 0.032007013860166744), (297, 0.049677727337640566), (309, 0.04849918971288157), (316, 0.04097534409272

In [19]:
def doc_to_topics(text):
    return lda_tfidf_model[topic_preprocess(text)]

[(5, 0.54721564), (13, 0.3566922)]


<br><br>


# LSTM +  Statistics

In [30]:
LSTM_model = keras.models.load_model(os.path.join('keras', 'LSTM_tree'))

In [42]:
series_len = 100
mask_value = -10
max_len = 0

num_topics = 15

def split_sentences(article_text):
    """Takes a string, returns a list of its individual sentences ()"""
    sentence_list = nltk.tokenize.sent_tokenize(article_text)
    return pd.Series(sentence_list)

def sentences_to_scores(text: list, method='VADER'):
    """Takes a list of sentences, returns a list of sentiment scores (per sentence)"""
    if method == 'VADER':
        scores = text.apply(lambda s: sia.polarity_scores(s)['compound']) #list of compound score per sentence
    else:
        scores = None
    return scores

def scores_to_sequence(scores):
    if len(scores) < series_len:
        t  = series_len - len(scores)
        scores = np.pad(scores, (t, 0), mode='constant', constant_values=mask_value)
        scores = scores.reshape((series_len, 1))
    else:
        scores = np.array(scores[(-1*series_len):])
        scores = scores.reshape((series_len, 1))
    return scores

def doc_to_stats(doc):
    """takes a text, returns a dict with the numpy descibe of the sentiment stats and LSTM prediction"""
    scores = sentences_to_scores(split_sentences(doc))
    sequence = scores_to_sequence(scores)
    output = {'LSTM_pred' : float(LSTM_model(np.array([sequence])))}
    output.update(scores.describe().to_dict())
    return (output)


print(doc_to_stats(text))

{'LSTM_pred': 0.617904543876648, 'count': 17.0, 'mean': -0.01017647058823531, 'std': 0.45047517208139753, 'min': -0.6808, '25%': -0.2732, '50%': 0.0, '75%': 0.3818, 'max': 0.7269}


<br><br>

# combination definition

In [59]:
%%time

stat_cols = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'LSTM_pred']
topic_cols = [('T'+str(x)) for x in range(0, num_topics)]

# text to list of topic scores

def doc_to_row(doc, veracity):
    new_row = dict.fromkeys(topic_cols , 0)
    
    new_row.update(doc_to_stats(doc)) # add the sentiment key and value pairs
    
    for index, score in doc_to_topics(doc): # update the topic distributions
        new_row['T'+str(index)] = score

    new_row['veracity'] = veracity
        
    return(new_row)

print(doc_to_row(text, 1))

{'T0': 0, 'T1': 0, 'T2': 0, 'T3': 0, 'T4': 0, 'T5': 0.5471641, 'T6': 0, 'T7': 0, 'T8': 0, 'T9': 0, 'T10': 0, 'T11': 0, 'T12': 0, 'T13': 0.3567433, 'T14': 0, 'LSTM_pred': 0.617904543876648, 'count': 17.0, 'mean': -0.01017647058823531, 'std': 0.45047517208139753, 'min': -0.6808, '25%': -0.2732, '50%': 0.0, '75%': 0.3818, 'max': 0.7269, 'veracity': 1}
Wall time: 355 ms


<br><br>

# Applying to the new Data

In [73]:
%%time

cols = ['file', 'text', 'veracity', 'prediction_emiel']

p_RealBuzz = os.path.join('sources', 'akkerman', 'RealBuzzfeed')
p_RealPoli = os.path.join('sources', 'akkerman', 'RealPolitifact')
p_FakeBuzz = os.path.join('sources', 'akkerman', 'FakeBuzzfeed')
p_FakePoli = os.path.join('sources', 'akkerman', 'FakePolitifact')

directories = [[p_RealBuzz, 'TRUE'],
               [p_RealPoli, 'TRUE'],
               [p_FakeBuzz, 'FAKE'],
               [p_FakePoli, 'FAKE']]

df_Complete = pd.DataFrame()

for path, veracity in directories:
    df_part = None
    df_part = pd.DataFrame(columns=stat_cols+topic_cols+['veracity'])
    
    for filename in glob.glob(os.path.join(path, '*.json')): # loop over .json files in the cwd
        with open(filename) as f:
            new_row = doc_to_row(json.load(f)['text'], veracity)
            df_part = df_part.append(new_row, ignore_index=True)
      
    df_Complete = df_Complete.append(df_part, ignore_index=True)

display(df_Complete)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,LSTM_pred,T0,...,T6,T7,T8,T9,T10,T11,T12,T13,T14,veracity
0,30.0,-0.186697,0.581253,-0.9360,-0.690150,-0.1217,0.245100,0.9041,0.508691,0,...,0,0,0.0319098,0,0,0,0,0.850386,0.0506049,TRUE
1,24.0,0.274229,0.478666,-0.7003,-0.007150,0.4215,0.653100,0.8750,0.787888,0,...,0,0,0,0.0504311,0,0,0,0.781340,0,TRUE
2,54.0,0.175761,0.486247,-0.8807,0.000000,0.1492,0.621775,0.9153,0.418227,0,...,0,0,0.0316043,0,0,0,0,0.841872,0.0717724,TRUE
3,16.0,-0.168044,0.600156,-0.9231,-0.603800,-0.4404,0.288900,0.9552,0.368838,0,...,0,0,0.0579058,0,0,0,0,0.834327,0,TRUE
4,27.0,-0.195730,0.504106,-0.8750,-0.618150,-0.1027,0.036900,0.8658,0.531730,0.0488056,...,0,0.0375881,0,0.0475887,0.0228834,0,0,0.770457,0,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,6.0,0.326400,0.547950,-0.7430,0.327175,0.5103,0.649775,0.7184,0.341741,0.0658839,...,0,0,0.0918253,0,0,0,0,0.713498,0.0289415,FAKE
418,21.0,-0.231776,0.322246,-0.8622,-0.510600,0.0000,0.000000,0.2023,0.514779,0.051523,...,0,0,0.117269,0,0,0,0,0.698151,0,FAKE
419,11.0,0.084145,0.504422,-0.7351,-0.318000,0.0000,0.513400,0.7269,0.359130,0.113842,...,0.0100423,0.0100423,0.0100435,0.0100423,0.0100423,0.0100423,0.0321558,0.703802,0.0208956,FAKE
420,12.0,-0.060133,0.320544,-0.6124,-0.311875,-0.0258,0.156575,0.4019,0.588307,0,...,0.0356699,0,0,0,0,0,0,0.546104,0,FAKE


Wall time: 3min 17s


In [74]:
df_Complete.to_csv(os.path.join('out','AkkermanData_Emiel.csv'), index=False)