In [2]:
# imports
import os
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()

import gensim

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

#wordnet lemmatization
nltk.download('wordnet')
#more for preprocessing
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = PorterStemmer()

import pyLDAvis.gensim

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

import pickle

import warnings; warnings.simplefilter('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<br>

## 1: Load the dataset <a class="anchor" id="chapter1"></a>

In [96]:
# Both ISOT files sit in the same folder, so the paths are built from one base directory.
isot_dir = os.path.join("sources", "ISOT")
dataset_path_true = os.path.join(isot_dir, "True.csv")
dataset_path_fake = os.path.join(isot_dir, "Fake.csv")

# The encoding is passed explicitly so both files are decoded as UTF-8.
df_true = pd.read_csv(dataset_path_true, encoding='utf-8')
df_fake = pd.read_csv(dataset_path_fake, encoding='utf-8')

# First five rows of each frame, kept as small samples for the demo cells below.
dfm_true = df_true.head()
dfm_fake = df_fake.head()

display(dfm_true, dfm_fake)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


21417


<br>

## 2: Split the text into sentences <a class="anchor" id="chapter2"></a>

in lab 03 we used a big manual function, but for now we will use the nltk tokenizer here :)

In [4]:
def split_sentences(article_text):
    """Split a text into sentences with NLTK's ``sent_tokenize``.

    article_text: the string to split.
    Returns a pandas Series with one sentence per element, in text order.
    """
    sentences = nltk.tokenize.sent_tokenize(article_text)
    return pd.Series(sentences)

# Split the text of the first row of the True.csv sample and show its first two
# and last two sentences.
sample_sentences = split_sentences(dfm_true.text[0])
display(sample_sentences.iloc[[0,1,-2,-1]])

0     WASHINGTON (Reuters) - The head of a conservat...
1     In keeping with a sharp pivot under way among ...
28    The package far exceeded the $44 billion reque...
29             The Senate has not yet voted on the aid.
dtype: object

### 2.1 Corpus of Sentences

In [4]:
# NOTE(review): placeholder only - nothing in the visible part of this notebook fills this list.
corpus_sentences = [] # a list of all documents (by sentence)

### 2.2 Corpus of Entire Texts

In [5]:
# List of all documents (entire body): the texts from df_true come first,
# the texts from df_fake are appended after them.
corpus_texts = df_true['text'].tolist()
corpus_texts.extend(df_fake['text'].tolist())

print("amount of documents in corpus: ", len(corpus_texts))

amount of documents in corpus:  44898


<br>

## 3 Text Preprocessing
In order to do proper topic analysis, the text first has to be normalised by removing its uninteresting properties. We lowercase it, lemmatize and stem every word, and remove stopwords (it, them, ...) as well as all words of 3 characters or fewer.

Now with bi and tri grams!

In [12]:
# Created once and reused: the function below is called for every token of every
# document, so constructing a new lemmatizer on each call is wasted work.
_lemmatizer = WordNetLemmatizer()

def stem_lemmatize(text):
    """Lemmatize one token as a verb (WordNet), then stem the result with ``stemmer``.

    text: a single token (string).
    Returns the stemmed lemma as a string.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

In [83]:
def prepare_text(text):
    """Tokenise, filter and normalise one document (no bigram/trigram merging).

    Tokens come from gensim's ``simple_preprocess``. Gensim stopwords and tokens
    of 3 characters or fewer are dropped; every remaining token is passed through
    ``stem_lemmatize``.

    text: the raw document string.
    Returns the list of processed tokens in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        stem_lemmatize(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

print(prepare_text(corpus_texts[7]))

['follow', 'statement', 'post', 'verifi', 'twitter', 'account', 'presid', 'donald', 'trump', 'realdonaldtrump', 'potu', 'opinion', 'express', 'reuter', 'edit', 'statement', 'confirm', 'accuraci', 'realdonaldtrump', 'fake', 'news', 'love', 'talk', 'call', 'approv', 'rat', 'foxandfriend', 'show', 'rat', 'approxim', 'presid', 'obama', 'despit', 'massiv', 'neg', 'trump', 'coverag', 'russia', 'hoax', 'unit', 'state', 'post', 'offic', 'lose', 'billion', 'dollar', 'year', 'charg', 'amazon', 'littl', 'deliv', 'packag', 'make', 'amazon', 'richer', 'post', 'offic', 'dumber', 'poorer', 'charg', 'sourc', 'link', 'jpexyr']


In [14]:
# Tokenise/normalise every document once; the phrase models are trained on this list.
prepared_corpus_texts = [prepare_text(d) for d in corpus_texts]
# Show the list that was just built. `processed_corpus_texts` does not exist yet
# at this point when the notebook is run top to bottom on a fresh kernel.
print(prepared_corpus_texts[7])

['follow', 'statement', 'post', 'verifi', 'twitter', 'account', 'presid', 'donald', 'trump', 'realdonaldtrump', 'potu', 'opinion', 'express', 'reuter', 'edit', 'statement', 'confirm', 'accuraci', 'realdonaldtrump', 'fake', 'news', 'love', 'talk', 'call', 'approv', 'rat', 'foxandfriend', 'show', 'rat', 'approxim', 'presid', 'obama', 'despit', 'massiv', 'neg', 'trump', 'coverag', 'russia', 'hoax', 'unit', 'state', 'post', 'offic', 'lose', 'billion', 'dollar', 'year', 'charg', 'amazon', 'littl', 'deliv', 'packag', 'make', 'amazon', 'richer', 'post', 'offic', 'dumber', 'poorer', 'charg', 'sourc', 'link', 'jpexyr']


In [45]:
%%time

bigram = gensim.models.Phrases(processed_corpus_texts, min_count=5, threshold=20) # 5 and 10 are the default values, but this can be tweaked
trigram = gensim.models.Phrases(bigram[processed_corpus_texts], threshold=20)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Apply ``bigram_mod`` to every token list in ``texts``.

    texts: an iterable of token lists.
    Returns a list with one transformed token list per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every token list in ``texts``.

    texts: an iterable of token lists.
    Returns a list with one transformed token list per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged


Wall time: 2min 8s


In [53]:
print(make_trigrams([prepared_corpus_texts[1]]))
print("\n\n")
# `prepare_text` is used here rather than `preprocess`: `preprocess` is only defined
# in a later cell (NameError on a fresh top-to-bottom run), and it already applies
# both phrase models itself, so wrapping it would apply them a second time.
print(trigram_mod[bigram_mod[prepare_text("donald trump")]])

[['washington_reuter', 'transgend_peopl', 'allow', 'time', 'enlist', 'militari', 'start', 'monday', 'order', 'feder', 'court', 'pentagon', 'say', 'friday', 'presid', 'donald_trump', 'administr', 'decid', 'appeal', 'rule', 'block', 'transgend', 'feder_appeal_court', 'washington', 'virginia', 'week', 'reject', 'administr', 'request', 'hold', 'order', 'lower_court_judg', 'requir', 'militari', 'begin', 'accept_transgend_recruit', 'justic_depart', 'offici', 'say', 'administr', 'challeng', 'rule', 'depart', 'defens', 'announc', 'releas', 'independ', 'studi', 'issu', 'come', 'week', 'litig', 'interim', 'appeal', 'occur', 'administr', 'decid', 'wait', 'studi', 'continu', 'defend', 'presid', 'law', 'author', 'district_court', 'meantim', 'offici', 'say', 'speak_condit_anonym', 'septemb', 'pentagon', 'say', 'creat', 'panel', 'senior', 'offici', 'studi', 'implement', 'direct', 'trump', 'prohibit', 'transgend_individu_serv', 'defens', 'depart', 'submit', 'plan', 'trump', 'lawyer', 'repres', 'curren

In [84]:
def preprocess(text):
    """Run the full preprocessing for one raw document.

    The text is tokenised and normalised by ``prepare_text``; the resulting tokens
    are then passed through ``bigram_mod`` and ``trigram_mod`` in that order.

    text: the raw document string.
    Returns the transformed token sequence.
    """
    tokens = prepare_text(text)
    with_bigrams = bigram_mod[tokens]
    return trigram_mod[with_bigrams]

<br>

## 4 Dictionary Creation

The dictionary stores every distinct token that occurs in the corpus and gives it an integer id; those ids are later used to build the vector representation of each document. Of course not all tokens are important, so we drop tokens that occur in more than 60% of the documents in our corpus (those are very likely words that slipped past the stopword list) and tokens that occur in fewer than 15 documents (not important enough for a topic). Of the remaining tokens, at most the 100000 most frequent ones are kept.

In [55]:
%%time

# Creating the gensim dictionary of word embeddings

if (0):
    dictionary = gensim.corpora.Dictionary.load(os.path.join("gensim", "dictionary"))

if (0):
    
    processed_corpus_texts = [trigram_mod[bigram_mod[doc]] for doc in prepared_corpus_texts]

    dictionary = gensim.corpora.Dictionary(processed_corpus_texts)
    print("unfiltered: ", dictionary)

    dictionary.filter_extremes(no_below=15, no_above=0.6, keep_n=100000)
    
print("  filtered: ", dictionary)

unfiltered:  Dictionary(116439 unique tokens: ['administr', 'agre', 'aid', 'approach', 'approv']...)
  filtered:  Dictionary(21592 unique tokens: ['administr', 'agre', 'aid', 'approach', 'approv']...)
Wall time: 45.2 s


<br>

## 5 Bag of Words creation
Now we create a vector representation in the form of a bag of words for each document: a vector that tells us how often each word of the preprocessed text that also exists in our dictionary occurs.

In [59]:
%%time

if (0):
    with open('bag_of_words.pickle', 'rb') as f: dictionary = pickle.load(f)

if (0):
    bow_corpus_texts = [dictionary.doc2bow(text) for text in processed_corpus_texts]
    

Wall time: 5.85 s


<br>

## 6 LDA model creation


In [60]:
%%time

num_topics = 15
if (0):
    lda_model = gensim.models.LdaMulticore.load(os.path.join("gensim", "lda_model20"))
if (0):
    lda_modelX = gensim.models.LdaMulticore(corpus = bow_corpus_texts,
                                            id2word = dictionary,
                                            num_topics = num_topics,
                                            passes = 20,
                                            iterations = 200,
                                            eta = 'auto',
                                            #alpha = 'auto',
                                            workers = 2)
    # multicore speeds up the process significantly

Wall time: 9min 7s


<br>

## 7 TF-IDF x LDA model creation 


In [62]:
# train the model
# The TF-IDF model is fitted on the bag-of-words corpus built above.
tfidf_model = gensim.models.TfidfModel(bow_corpus_texts)
# Indexing the fitted model with the corpus gives the TF-IDF-weighted corpus that
# the TF-IDF LDA model below is trained on.
tfidf_corpus_texts = tfidf_model[bow_corpus_texts]

In [76]:
%%time

num_topics = 15

if (0):
    lda_tfidf_model = gensim.models.LdaMulticore(corpus = tfidf_corpus_texts,
                                        id2word = dictionary,
                                        num_topics = num_topics,
                                        passes = 20,
                                        iterations = 200,
                                        eta = 'auto',
                                        #alpha = 'auto', # not available form multicore
                                        workers = 2)
if (0):
    lda_tfidf_model.save(os.path.join("gensim", "02", "lda_model")) #save model itself
    with open(os.path.join("gensim", "02", "tfidf_corpus.pickle"), 'wb') as f: pickle.dump(tfidf_corpus_texts, f)    #dump tfidf_corpus_texts
    dictionary.save(os.path.join("gensim", "02", "dictionary")) #save dictionary

# multicore speeds up the process significantly

Wall time: 2.64 s


In [67]:
%%time

from gensim.models import CoherenceModel# Compute Coherence Score

coherence_model_lda = CoherenceModel(model=lda_modelX, texts=processed_corpus_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

coherence_model_lda_tfidf = CoherenceModel(model=lda_tfidf_model, texts=processed_corpus_texts, dictionary=dictionary, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()

print('\nCoherence Score: ', coherence_lda)
print('\nCoherence Score: ', coherence_lda_tfidf)


Coherence Score:  0.47852099608521953

Coherence Score:  0.5345937527916345
Wall time: 1min 39s


<br>

## 8 Case Examination
take a look at some classifications

In [None]:
#TODO: use LDAvis while tweaking the model; save pictures and notes for every step

pyLDAvis.enable_notebook()
# `lda_modelX` is the model trained on `bow_corpus_texts`, so the two belong together.
visualisation = pyLDAvis.gensim.prepare(lda_modelX, bow_corpus_texts, dictionary)

ldavis_dir = os.path.join("gensim", "LDAvis")
os.makedirs(ldavis_dir, exist_ok=True)
# ".html" extension so the saved file opens in a browser
pyLDAvis.save_html(visualisation, os.path.join(ldavis_dir, "LDAvis.html"))

In [86]:
article = 30000

# Preprocess the article once and reuse the tokens for every print below.
article_tokens = preprocess(corpus_texts[article])

print("\nmost frequently occuring word: ", max(article_tokens, key=article_tokens.count))

# (heading, model, document vector) for each of the two models being compared
predictions = [
    ("\n\nbow-lda prediction:", lda_modelX, bow_corpus_texts[article]),
    ("\n\ntfidf-lda prediction:", lda_tfidf_model, tfidf_corpus_texts[article]),
]

for heading, model, doc_vector in predictions:
    print(heading)
    # topics of this document, highest score first
    for index, score in sorted(model[doc_vector], key=lambda tup: tup[1], reverse=True):
        print("\nScore: {}\t \nTopic: {}".format(score, model.print_topic(index, 10)))

print("\n\n", article_tokens, "\n\n", corpus_texts[article])


most frequently occuring word:  trump


bow-lda prediction:

Score: 0.5868434309959412	 
Topic: 0.014*"like" + 0.013*"peopl" + 0.011*"know" + 0.011*"go" + 0.010*"think" + 0.010*"obama" + 0.009*"presid" + 0.009*"american" + 0.009*"trump" + 0.008*"time"

Score: 0.29268181324005127	 
Topic: 0.161*"trump" + 0.039*"presid" + 0.026*"clinton" + 0.024*"donald_trump" + 0.017*"campaign" + 0.016*"white_hous" + 0.010*"report" + 0.008*"hillari_clinton" + 0.008*"http" + 0.008*"tell"

Score: 0.09060100466012955	 
Topic: 0.012*"year" + 0.011*"republican" + 0.009*"million" + 0.009*"plan" + 0.009*"fund" + 0.008*"congress" + 0.007*"hous" + 0.007*"american" + 0.007*"state" + 0.007*"govern"

Score: 0.022879285737872124	 
Topic: 0.015*"women" + 0.014*"black" + 0.011*"protest" + 0.010*"student" + 0.010*"white" + 0.009*"support" + 0.008*"school" + 0.008*"video" + 0.007*"twitter" + 0.007*"peopl"


tfidf-lda prediction:

Score: 0.8023303151130676	 
Topic: 0.005*"trump" + 0.003*"obama" + 0.002*"american" + 0.00

<br>

## X Saving items
Because we don't want to sit waiting every time.

In [16]:
# Manual switch: set to (1) to write the current model, bag of words and dictionary to disk.
if (0):
    os.makedirs("gensim", exist_ok=True)
    # The bag-of-words LDA model is held in `lda_modelX`.
    # NOTE(review): the file name "lda_modelXX" differs from the name the LDA cell
    # loads ("lda_model20") - confirm which name is intended.
    lda_modelX.save(os.path.join("gensim", "lda_modelXX"))
    with open(os.path.join("gensim", "bag_of_words.pickle"), 'wb') as f:
        pickle.dump(bow_corpus_texts, f)
    dictionary.save(os.path.join("gensim", "dictionary"))


<br>

## 10: Performing sentiment analysis <a class="anchor" id="chapter3"></a>
Which we can do either to the whole article or on a sentence basis and then average

In [71]:
def get_scores(text: "pd.Series | list", method='VADER'):
    """Score every sentence of a document.

    text: the sentences to score, as a pandas Series or a plain list of strings.
    method: scoring method; only 'VADER' is implemented.

    Returns a Series with one compound score per sentence for 'VADER',
    or None for any other method.
    """
    # `.apply` only exists on a Series, so a plain list is wrapped first;
    # a Series is used as-is and keeps its index.
    sentences = text if isinstance(text, pd.Series) else pd.Series(text, dtype=object)

    if method == 'VADER':
        # compound score per sentence
        return sentences.apply(lambda s: sia.polarity_scores(s)['compound'])

    return None

In [72]:
def average_sentiment(doc):
    """Return the mean of the per-sentence scores of one document as a float.

    doc: the document text.
    Returns NaN when the document yields no sentences.
    """
    doc_sentence_list = split_sentences(doc)
    if len(doc_sentence_list) == 0:
        # Nothing to average: return NaN explicitly rather than relying on what
        # np.average does with an empty input.
        return float('nan')
    return float(np.average(get_scores(doc_sentence_list)))

# Example: mean per-sentence score of document 10005.
average_sentiment(corpus_texts[10005])

0.10535714285714284

As can be seen in the following cell, scoring the entire document in one go gives an extreme and unnuanced value compared with the average of the per-sentence scores.

In [74]:
article = 10005
sample_article = corpus_texts[article]

# Score of the whole document, computed in one call.
sample_scoreT = sia.polarity_scores(sample_article)['compound']

# Scores of the individual sentences of the same document.
sample_sentences = split_sentences(sample_article)
sample_scoreS = get_scores(sample_sentences)

# whole-document score, averaged sentence score, then the per-sentence scores
print(
    sample_scoreT,
    "\n\n", average_sentiment(sample_article),
    "\n\nfrom:\n", sample_scoreS,
)

0.9153 

 0.10535714285714284 

from:
 0     0.4588
1    -0.3818
2    -0.3818
3     0.4019
4     0.0000
5     0.0000
6     0.1779
7     0.1531
8     0.8442
9     0.4215
10   -0.2960
11    0.0000
12    0.0772
13    0.0000
dtype: float64


In [105]:
%%time

topic_cols = ['Sentiment']+[('T'+str(x)) for x in range(0, num_topics)]+['Veracity']
# text to list of topic scores

def docid_to_row(doc_id):
    new_row = dict.fromkeys(topic_cols , 0)
    
    new_row['Sentiment'] = average_sentiment(corpus_texts[doc_id])
    for index, score in lda_tfidf_model[tfidf_corpus_texts[doc_id]]:
        new_row['T'+str(index)] = score
    if(doc_id<21417):
        new_row['Veracity'] = 1
    return(new_row)

print(docid_to_row(0))

{'Sentiment': 0.07614666666666665, 'T0': 0, 'T1': 0, 'T2': 0, 'T3': 0, 'T4': 0, 'T5': 0, 'T6': 0, 'T7': 0, 'T8': 0, 'T9': 0, 'T10': 0.026416631, 'T11': 0, 'T12': 0, 'T13': 0.8555344, 'T14': 0.054770425, 'Veracity': 1}
Wall time: 17 ms


In [106]:
%%time

df_topic_sentiment = pd.DataFrame(columns=topic_cols)

for doc_id in range(len(corpus_texts)):
    df_topic_sentiment = df_topic_sentiment.append(docid_to_row(doc_id), ignore_index=True)


Wall time: 7min 5s


In [108]:
# Make sure the output folder exists so the export does not fail on a fresh checkout.
os.makedirs('out', exist_ok=True)
df_topic_sentiment.to_csv(os.path.join('out', 'topic_sentiment.csv'), index=False)