In [15]:
# imports
import os
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()

import gensim

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

#wordnet lemmatization
nltk.download('wordnet')
#more for preprocessing
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = PorterStemmer()

import pyLDAvis.gensim

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

import pickle

import warnings; warnings.simplefilter('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<br>

## 1: Load the dataset <a class="anchor" id="chapter1"></a>

In [2]:
# Locations of the two ISOT CSV files, relative to the working directory.
dataset_path_true = os.path.join("sources", "ISOT", "True.csv")
dataset_path_fake = os.path.join("sources", "ISOT", "Fake.csv")

# Read both files explicitly as UTF-8 (True.csv first, then Fake.csv).
df_true, df_fake = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (dataset_path_true, dataset_path_fake)
)

# First five rows of each frame, kept as small preview samples.
dfm_true, dfm_fake = df_true.head(), df_fake.head()

display(dfm_true, dfm_fake)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


<br>

## 2: Split the text into sentences <a class="anchor" id="chapter2"></a>

In lab 03 we used a large hand-written function; here we use the NLTK sentence tokenizer instead :)

In [3]:
def split_sentences(article_text):
    """Split one article into its sentences.

    Takes the article as a single string, runs NLTK's sentence tokenizer over
    it and returns the sentences as a ``pd.Series`` (one sentence per entry,
    in their original order).
    """
    sentences = nltk.tokenize.sent_tokenize(article_text)
    return pd.Series(sentences)

# Demo on the text stored under index label 0 of the preview frame:
# show its first two and its last two sentences.
sample_sentences = split_sentences(dfm_true.loc[0, "text"])
display(sample_sentences.iloc[[0, 1, -2, -1]])

0     WASHINGTON (Reuters) - The head of a conservat...
1     In keeping with a sharp pivot under way among ...
28    The package far exceeded the $44 billion reque...
29             The Senate has not yet voted on the aid.
dtype: object

### 2.1 Corpus of Sentences

In [4]:
# Placeholder for a sentence-level corpus; none of the cells visible in this notebook fill it.
corpus_sentences = [] # a list of all documents (by sentence)

### 2.2 Corpus of Entire Texts

In [5]:
# Corpus of full article bodies: all rows of df_true first, followed by all
# rows of df_fake, so the first len(df_true) entries come from True.csv.
corpus_texts = df_true['text'].tolist() + df_fake['text'].tolist()

print("amount of documents in corpus: ", len(corpus_texts))

amount of documents in corpus:  44898


<br>

## 3 Text Preprocessing
In order to do proper topic analysis, the text needs to be normalised by removing its uninteresting properties. We lower-case it, lemmatize and stem it, and remove stopwords (it, them, ...) as well as all words of 3 characters or fewer.

Now with bi and tri grams!

In [None]:
# Every article split on single spaces; this token stream is what the phrase
# detectors are trained on.
corpus_word_stream = [doc.split(" ") for doc in corpus_texts]

bigram = gensim.models.Phrases(corpus_word_stream, min_count=5, threshold=10) # 5 and 10 are the default values, but this can be tweaked
# The trigram detector is trained on the bigram-merged stream of the same
# corpus (the original referenced an undefined name `data_words` here).
trigram = gensim.models.Phrases(bigram[corpus_word_stream], threshold=10)

# Frozen versions of both detectors, used by make_bigrams / make_trigrams.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Run every tokenised document in ``texts`` through ``bigram_mod``.

    Returns a list with one entry per input document, each entry being the
    result of ``bigram_mod[doc]``.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Run every tokenised document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a list with one entry per input document, each entry being
    ``trigram_mod[bigram_mod[doc]]``.
    """
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


In [53]:
def stem_lemmatize(text):
    """Lemmatize a single token as a verb, then stem the lemma.

    The token is first passed to a WordNet lemmatizer with ``pos='v'``; the
    result is then reduced by the notebook-level ``stemmer`` and returned.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

In [54]:

def preprocess(text):
    """Turn one raw article into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stopwords or have 3 characters or fewer are dropped, and every
    remaining token is passed through ``stem_lemmatize``.
    """
    # Planned pipeline (notes kept from the original author):
    #remove stopwords+simpleprepro
    #bi/tri-gram building
    #lemma
    # TODO where goes stemming?
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        stem_lemmatize(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

# Smoke test: print the token list preprocess() produces for the article at index 7.
print(preprocess(corpus_texts[7]))

['follow', 'statement', 'post', 'verifi', 'twitter', 'account', 'presid', 'donald', 'trump', 'realdonaldtrump', 'potu', 'opinion', 'express', 'reuter', 'edit', 'statement', 'confirm', 'accuraci', 'realdonaldtrump', 'fake', 'news', 'love', 'talk', 'call', 'approv', 'rat', 'foxandfriend', 'show', 'rat', 'approxim', 'presid', 'obama', 'despit', 'massiv', 'neg', 'trump', 'coverag', 'russia', 'hoax', 'unit', 'state', 'post', 'offic', 'lose', 'billion', 'dollar', 'year', 'charg', 'amazon', 'littl', 'deliv', 'packag', 'make', 'amazon', 'richer', 'post', 'offic', 'dumber', 'poorer', 'charg', 'sourc', 'link', 'jpexyr']


<br>

## 4 Dictionary Creation

A gensim `Dictionary` maps every token that occurs in the preprocessed corpus to an integer id (it is a vocabulary, not a word embedding); those ids are what the bag-of-words vectors below refer to. Of course not all tokens are important, so we filter the extremes: tokens that occur in fewer than 15 documents (not important enough for a topic) or in more than 60% of the documents (very likely words that slipped past the stopword list) are dropped, and of what remains at most the 100000 most frequent tokens are kept.

In [12]:
%%time

# Creating the gensim dictionary of word embeddings

if (0):
    dictionary = gensim.corpora.Dictionary.load(os.path.join("gensim", "dictionary"))


if (1):
    
    processed_corpus_texts = [preprocess(d) for d in corpus_texts]

    dictionary = gensim.corpora.Dictionary(processed_corpus_texts)
    print("unfiltered: ", dictionary)

    dictionary.filter_extremes(no_below=15, no_above=0.6, keep_n=100000)
    
print("  filtered: ", dictionary)

unfiltered:  Dictionary(83579 unique tokens: ['action', 'administr', 'agre', 'aid', 'approach']...)
  filtered:  Dictionary(13703 unique tokens: ['action', 'administr', 'agre', 'aid', 'approach']...)
Wall time: 6min 32s


<br>

## 5 Bag of Words creation
Now we create a vector representation in the form of a bag of words for each document: a vector that tells us how often each word of the preprocessed text that also exists in our dictionary occurs in that document.

In [13]:
%%time

if (1):
    with open('bag_of_words.pickle', 'rb') as f: dictionary = pickle.load(f)

if (0):
    bow_corpus_texts = [dictionary.doc2bow(text) for text in processed_corpus_texts]
    

<br>

## 6 LDA model creation


In [49]:
%%time

num_topics = 15
if (0):
    lda_model = gensim.models.LdaMulticore.load(os.path.join("gensim", "lda_model20"))
if (1):
    lda_modelX = gensim.models.LdaMulticore(corpus = bow_corpus_texts,
                                            id2word = dictionary,
                                            num_topics = num_topics,
                                            passes = 20,
                                            iterations = 200,
                                            eta = 'auto',
                                            #alpha = 'auto',
                                            workers = 2)
    # multicore speeds up the process significantly

NameError: name 'model' is not defined

In [52]:
# One (topic, coherence score) pair per topic of the model.
top_topics = lda_modelX.top_topics(bow_corpus_texts)

# Average over the number of topics actually returned rather than a
# hard-coded 15, so the figure stays correct when num_topics changes.
avg_topic_coherence = sum(t[1] for t in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

print(top_topics)

Average topic coherence: -1.5653.
[([(0.015163837, 'peopl'), (0.009979308, 'american'), (0.009363801, 'black'), (0.009183135, 'right'), (0.008101126, 'like'), (0.0076126237, 'america'), (0.00614769, 'live'), (0.006065929, 'student'), (0.0058811433, 'school'), (0.005828118, 'white'), (0.00544905, 'support'), (0.00516922, 'want'), (0.004894576, 'nation'), (0.004520604, 'think'), (0.004476691, 'http'), (0.004425941, 'women'), (0.004354366, 'countri'), (0.0041992464, 'go'), (0.004044317, 'year'), (0.004040936, 'come')], -1.2379890213623088), ([(0.012892715, 'news'), (0.011615788, 'like'), (0.011436488, 'time'), (0.011379721, 'know'), (0.011010983, 'imag'), (0.010515623, 'go'), (0.00945052, 'media'), (0.008276315, 'video'), (0.008199434, 'twitter'), (0.008135565, 'think'), (0.007475627, 'fact'), (0.0072948793, 'watch'), (0.0071493187, 'peopl'), (0.007148303, 'claim'), (0.0067389724, 'featur'), (0.006581249, 'thing'), (0.0062599415, 'actual'), (0.006170679, 'fake'), (0.006101773, 'hillari'),

<br>

## 7 TF-IDF x LDA model creation 


In [18]:
# Fit a TF-IDF model on the bag-of-words corpus (used below to weight single documents).
tfidf_model = gensim.models.TfidfModel(bow_corpus_texts)

<br>

## 8 Case Examination
take a look at some classifications

In [None]:
# TODO: use LDAvis while tuning the model; save the pictures and note the steps taken.

pyLDAvis.enable_notebook()
# Visualise the model bound by the LDA cell above (the original referenced an
# undefined `lda_model50`).
visualisation = pyLDAvis.gensim.prepare(lda_modelX, bow_corpus_texts, dictionary)
# Make sure the output folder exists, and write a file with a proper .html extension.
os.makedirs(os.path.join("gensim", "LDAvis"), exist_ok=True)
pyLDAvis.save_html(visualisation, os.path.join("gensim", "LDAvis", "LDAvis.html"))

In [29]:
article = 30000

# Preprocess the article once and reuse the tokens (the original ran
# preprocess() three times on the same text).
article_tokens = preprocess(corpus_texts[article])

# First token with the highest count; None when nothing survives preprocessing.
most_frequent = max(article_tokens, key=article_tokens.count) if article_tokens else None
print("\nmost frequently occuring word: ", most_frequent)

print("\n\ntf-idf representation:")

print(tfidf_model[bow_corpus_texts[article]])

print("\n\nlda prediction:")

# Topics of this document, highest score first. Uses lda_modelX, the name
# bound by the training branch of the LDA cell.
for index, score in sorted(lda_modelX[bow_corpus_texts[article]],
                           key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_modelX.print_topic(index, 10)))

print("\n\n", article_tokens, "\n\n", corpus_texts[article])


most frequently occuring word:  trump


tf-idf representation:
[(54, 0.05205938794681232), (55, 0.05354190205365242), (61, 0.07113221361508652), (75, 0.03997404751583847), (79, 0.03459992716929177), (97, 0.07092158048519583), (98, 0.06297481560343694), (118, 0.04558896010987835), (120, 0.035550290840993065), (121, 0.04885654036858967), (137, 0.05105277762175444), (151, 0.01199919208264615), (158, 0.04117145008710115), (188, 0.05748465301769279), (189, 0.0648661753742098), (207, 0.08068109216841919), (210, 0.10997739313905487), (225, 0.016125794129544953), (254, 0.04185089721229258), (274, 0.045280672961440026), (278, 0.07945161777257916), (351, 0.03860218098862748), (393, 0.04261665742505229), (397, 0.06539403011355686), (433, 0.14846168836052567), (437, 0.060568827866825485), (457, 0.03815734324231499), (490, 0.039460953040126645), (499, 0.04778419034445287), (501, 0.031689640794038396), (519, 0.06606495240923384), (531, 0.04252023729317975), (539, 0.03959293098379113), (546, 0.21322

<br>

## X Saving items
Because we don't want to sit waiting for everything to be recomputed every time.

In [16]:
# Manual switch: set to 1 to store the model, the bag-of-words corpus and the
# dictionary in the "gensim" folder.
if (0):
    os.makedirs("gensim", exist_ok=True)  # all three files go in this folder
    # Save the model bound by the LDA cell (the original referenced an
    # undefined `lda_modelXX`).
    lda_modelX.save(os.path.join("gensim", "lda_modelXX"))
    with open(os.path.join("gensim", "bag_of_words.pickle"), 'wb') as f:
        pickle.dump(bow_corpus_texts, f)
    dictionary.save(os.path.join("gensim", "dictionary"))


<br>

## 10: Performing sentiment analysis <a class="anchor" id="chapter3"></a>
Which we can do either to the whole article or on a sentence basis and then average

In [30]:
def get_scores(text: "list | pd.Series", method='VADER'):
    """Score every sentence in ``text`` and return the scores as a Series.

    Parameters
    ----------
    text : list or pd.Series of str
        The sentences to score. A plain list is converted to a Series first;
        a Series is used as is, so its index is kept.
    method : str
        Scoring method. Only ``'VADER'`` is implemented.

    Returns
    -------
    pd.Series or None
        The VADER ``compound`` score of each sentence, or ``None`` when
        ``method`` is anything other than ``'VADER'``.
    """
    if method != 'VADER':
        return None

    # The original called .apply directly on `text`, which fails for the plain
    # list that the signature advertises.
    sentences = text if isinstance(text, pd.Series) else pd.Series(text, dtype=object)
    return sentences.apply(lambda s: sia.polarity_scores(s)['compound'])

In [45]:
# Compare two ways of scoring one article: VADER on the whole text versus the
# average of the per-sentence VADER scores.
article = 10005

sample_article = corpus_texts[article]
sample_sentences = split_sentences(sample_article)

# sample_scoreT: compound score of the full text in one call.
sample_scoreT = sia.polarity_scores(sample_article)['compound']
# sample_scoreS: one compound score per sentence.
sample_scoreS = get_scores(sample_sentences)

# Prints the whole-text score, the mean sentence score and the individual scores.
print(sample_scoreT, "\n\n", np.average(sample_scoreS), "\n\nfrom:\n", sample_scoreS)

0.9153 

 0.10535714285714284 

from:
 0     0.4588
1    -0.3818
2    -0.3818
3     0.4019
4     0.0000
5     0.0000
6     0.1779
7     0.1531
8     0.8442
9     0.4215
10   -0.2960
11    0.0000
12    0.0772
13    0.0000
dtype: float64
