In [80]:
# imports
import os
import pandas as pd
import numpy as np

import sys

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()

import gensim

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

#wordnet lemmatization
nltk.download('wordnet')
#more for preprocessing
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = PorterStemmer()

import pyLDAvis.gensim

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

import pickle

import warnings; warnings.simplefilter('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

<br>

## 1: Load the dataset <a class="anchor" id="chapter1"></a>

In [9]:
# Locations of the two ISOT CSV files (one with true articles, one with fake ones).
dataset_path_true = os.path.join("sources", "ISOT", "True.csv")
dataset_path_fake = os.path.join("sources", "ISOT", "Fake.csv")

# Both files are read with the same explicit encoding.
df_true, df_fake = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (dataset_path_true, dataset_path_fake)
)

# Keep only the first rows around for a quick visual check.
dfm_true, dfm_fake = df_true.head(), df_fake.head()

display(dfm_true)
display(dfm_fake)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [10]:
# list of all documents (by entire body): every true article first, then every fake one
corpus_texts = list(df_true['text']) + list(df_fake['text'])

print("amount of documents in corpus: ", len(corpus_texts))

amount of documents in corpus:  44898


## X: Creating the new data

### Loading models

In [15]:
# Previously trained gensim LDA model, loaded from its saved location.
lda_tfidf_model = gensim.models.LdaMulticore.load(os.path.join("gensim", "02", "lda_model"))

# Pickled corpus that is later indexed by document id and fed to the LDA model.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join('gensim', '02', 'tfidf_corpus.pickle'), 'rb') as f:
    tfidf_corpus_texts = pickle.load(f)

In [33]:
# `model` is needed below and in later cells. Reuse it when it already lives in
# the kernel; otherwise load the saved network from disk so the notebook also
# works after Restart Kernel -> Run All instead of failing with a NameError.
try:
    model
except NameError:
    model = keras.models.load_model(os.path.join('keras', 'LSTM_tree'))

# NOTE(review): the layer name is tied to the specific saved/trained model — confirm it
# still matches after reloading.
pre_activation_layer_name = 'dense_16'

# Second model sharing the same input, but exposing the output of the named layer.
model_pre_activation = keras.Model(inputs=model.input,
                                   outputs=model.get_layer(pre_activation_layer_name).output)

model_pre_activation.summary()


Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_16_input (InputLayer [(None, 100, 1)]          0         
_________________________________________________________________
masking_16 (Masking)         (None, 100, 1)            0         
_________________________________________________________________
lstm_50 (LSTM)               (None, 100, 100)          40800     
_________________________________________________________________
dropout_42 (Dropout)         (None, 100, 100)          0         
_________________________________________________________________
lstm_51 (LSTM)               (None, 100, 50)           30200     
_________________________________________________________________
dropout_43 (Dropout)         (None, 100, 50)           0         
_________________________________________________________________
lstm_52 (LSTM)               (None, 100, 25)          

In [50]:
series_len = 100  # fixed number of steps every sentiment sequence is padded/truncated to
mask_value = -10  # filler used to left-pad sequences shorter than series_len
max_len = 0  # NOTE(review): not referenced in the visible cells — confirm before removing

def text_to_sequence(text):
    """Turn a raw document into a fixed-length column of sentence sentiment scores.

    The text is split into sentences with NLTK, each sentence gets its VADER
    'compound' score, and the scores are shaped into a (series_len, 1) array.
    Short documents are left-padded with mask_value; long ones keep only their
    last series_len sentences.
    """
    compound = [sia.polarity_scores(sentence)['compound']
                for sentence in nltk.tokenize.sent_tokenize(text)]

    missing = series_len - len(compound)
    if missing > 0:
        # pad at the front so the real scores sit at the end of the sequence
        sequence = np.pad(compound, (missing, 0), mode='constant', constant_values=mask_value)
    else:
        sequence = np.array(compound[-series_len:])

    return sequence.reshape((series_len, 1))

def scores_to_sequence(scores, length=None, pad_value=None):
    """Shape a 1-D collection of scores into a fixed-length (length, 1) float array.

    Parameters
    ----------
    scores : list, numpy array or pandas Series of numbers
        Per-sentence scores in document order. May be empty.
    length : int, optional
        Number of steps in the result. Defaults to the module-level ``series_len``.
    pad_value : number, optional
        Filler placed in front of short inputs. Defaults to the module-level
        ``mask_value``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(length, 1)``. Inputs shorter than ``length`` are
        left-padded with ``pad_value``; longer inputs keep only their last
        ``length`` entries.
    """
    # Resolve the defaults at call time so the notebook-level constants are honoured.
    if length is None:
        length = series_len
    if pad_value is None:
        pad_value = mask_value

    # Coerce to float up front: an empty or object-dtype Series would otherwise
    # produce an object array, which is not a valid numeric model input.
    values = np.asarray(scores, dtype=float)

    missing = length - len(values)
    if missing > 0:
        values = np.pad(values, (missing, 0), mode='constant', constant_values=pad_value)
    else:
        # copy so the result never aliases the caller's data
        values = values[len(values) - length:].copy()

    return values.reshape((length, 1))


# Smoke test on one document. The sentence split and scoring are written out
# here because split_sentences/get_scores are only defined in a later cell;
# calling them at this point fails on a fresh kernel (Restart -> Run All).
sample_sentences = pd.Series(nltk.tokenize.sent_tokenize(corpus_texts[2]))
sample_scores = sample_sentences.apply(lambda s: sia.polarity_scores(s)['compound'])
sample_sequence = scores_to_sequence(sample_scores)
# the model takes a batch, so wrap the single sequence in an outer array
sample_prediction = float(model(np.array([sample_sequence])))

print(sample_prediction)

0.617904543876648


In [53]:
%%time

# Number of topic columns (T0 .. T{num_topics-1}) created for every row.
# NOTE(review): presumably has to match the topic count of the loaded LDA model — confirm.
num_topics = 15

def split_sentences(article_text):
    """Split a text into sentences.

    Runs NLTK's sentence tokenizer on the given string and returns the
    resulting sentences, in order, as a pandas Series.
    """
    sentence_list = nltk.tokenize.sent_tokenize(article_text)
    return pd.Series(sentence_list)

def get_scores(text: pd.Series, method='VADER'):
    """Score every sentence of a document.

    Parameters
    ----------
    text : pandas Series of str
        The sentences to score (a plain list of strings is accepted as well).
    method : str
        Scoring method. Only 'VADER' is implemented.

    Returns
    -------
    pandas Series of float, or None
        The VADER 'compound' score per sentence, always with float dtype so that
        ``describe()`` yields the numeric statistics even for an empty input.
        ``None`` is returned for any method other than 'VADER'.
    """
    if method != 'VADER':
        return None

    # Wrapping lets lists through; object dtype keeps the strings untouched.
    sentences = pd.Series(text, dtype=object)
    compound = sentences.apply(lambda s: sia.polarity_scores(s)['compound'])
    # An empty input would otherwise stay object-typed after apply.
    return compound.astype(float)

def sentiment_stats(doc):
    """Summarise the sentence-level sentiment of one document.

    The document is split into sentences and scored; the scores are then used
    twice: as a padded sequence fed to the model (stored under 'LSTM_pred') and
    as input to pandas' describe(), whose entries are merged into the result.

    Returns a dict with 'LSTM_pred' followed by the describe() statistics.
    """
    sentence_scores = get_scores(split_sentences(doc))

    # the model takes a batch, hence the extra outer dimension
    model_input = np.array([scores_to_sequence(sentence_scores)])
    prediction = float(model(model_input))

    summary = sentence_scores.describe().to_dict()
    return {'LSTM_pred': prediction, **summary}


# Try the function on a single document; the returned dict is shown as the cell output.
sentiment_stats(corpus_texts[2])

Wall time: 354 ms


{'LSTM_pred': 0.617904543876648,
 'count': 17.0,
 'mean': -0.01017647058823531,
 'std': 0.45047517208139753,
 'min': -0.6808,
 '25%': -0.2732,
 '50%': 0.0,
 '75%': 0.3818,
 'max': 0.7269}

### Functions for building the feature table

In [54]:
%%time

# Column groups of the output table: sentiment statistics, then one column per
# topic (T0, T1, ...) followed by the label column.
stat_cols = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'LSTM_pred']
topic_cols = ['T{}'.format(topic_id) for topic_id in range(num_topics)]
topic_cols.append('Veracity')

# text to list of topic scores

def docid_to_row(doc_id, n_true_docs=None):
    """Build the feature row for one document of the corpus.

    Parameters
    ----------
    doc_id : int
        Position of the document in ``corpus_texts`` / ``tfidf_corpus_texts``.
    n_true_docs : int, optional
        Number of leading documents that come from the true-news file.
        Defaults to ``len(df_true)``, because ``corpus_texts`` is built as the
        true articles followed by the fake ones.

    Returns
    -------
    dict
        One entry per topic column (0 where the LDA model reports no score),
        the sentiment statistics from ``sentiment_stats``, and 'Veracity'
        (1 for a true article, 0 otherwise).
    """
    if n_true_docs is None:
        # derive the boundary from the data instead of hard-coding its size
        n_true_docs = len(df_true)

    # start with every topic column (and 'Veracity') at 0
    new_row = dict.fromkeys(topic_cols, 0)

    # add the sentiment key and value pairs
    new_row.update(sentiment_stats(corpus_texts[doc_id]))

    # update the topic distributions with the topics the model returns for this document
    for index, score in lda_tfidf_model[tfidf_corpus_texts[doc_id]]:
        new_row['T' + str(index)] = score

    # the first n_true_docs items of the corpus are the true articles
    if doc_id < n_true_docs:
        new_row['Veracity'] = 1

    return new_row

# Quick check: build and print the row for a single document.
print(docid_to_row(2))

{'T0': 0, 'T1': 0, 'T2': 0, 'T3': 0, 'T4': 0, 'T5': 0.54721564, 'T6': 0, 'T7': 0, 'T8': 0, 'T9': 0, 'T10': 0, 'T11': 0, 'T12': 0, 'T13': 0.3566922, 'T14': 0, 'Veracity': 1, 'LSTM_pred': 0.617904543876648, 'count': 17.0, 'mean': -0.01017647058823531, 'std': 0.45047517208139753, 'min': -0.6808, '25%': -0.2732, '50%': 0.0, '75%': 0.3818, 'max': 0.7269}
Wall time: 502 ms


In [81]:
%%time

count_texts = len(corpus_texts)

# Collect the rows in a plain list and build the frame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic overall), and DataFrame.append no longer exists in pandas 2.
rows = []
for doc_id in range(count_texts):
    rows.append(docid_to_row(doc_id))
    sys.stdout.write('\r')
    sys.stdout.write(str("current progress: {}%".format(str(np.round(doc_id/count_texts*100, 3)))))

# Fix the column order explicitly and store everything as float, matching the
# all-float table the row-by-row version produced.
df_topic_sentiment = pd.DataFrame(rows, columns=stat_cols+topic_cols).astype(float)

sys.stdout.write('\r')
sys.stdout.write('building table done!')
display(df_topic_sentiment.head())

building table done!.998%

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,LSTM_pred,T0,...,T6,T7,T8,T9,T10,T11,T12,T13,T14,Veracity
0,30.0,0.076147,0.402857,-0.6369,-0.077025,0.0,0.3182,0.9062,0.830285,0.0,...,0.0,0.0,0.0,0.0,0.026417,0.0,0.0,0.855568,0.054737,1.0
1,21.0,0.018648,0.486143,-0.8625,-0.2924,0.0772,0.34,0.7531,0.331971,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.776423,0.143212,1.0
2,17.0,-0.010176,0.450475,-0.6808,-0.2732,0.0,0.3818,0.7269,0.617905,0.047824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342261,0.0,1.0
3,16.0,-0.029581,0.438841,-0.6249,-0.38265,0.0,0.35045,0.743,0.57173,0.179278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280279,0.0,1.0
4,40.0,0.047153,0.327357,-0.6124,-0.156025,0.0,0.26705,0.6249,0.785731,0.0,...,0.0,0.051978,0.0,0.0,0.0,0.0,0.0,0.842692,0.0,1.0


Wall time: 4h 22min 32s


In [82]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a missing folder would otherwise lose a very long run).
os.makedirs('out', exist_ok=True)
df_topic_sentiment.to_csv(os.path.join('out','everything.csv'), index=False)