In [6]:
# imports
import os
import pickle
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

import gensim

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');
# create the analyzer only after the lexicon download above; on a machine
# without the lexicon, SIA() raises LookupError
sia = SIA()

#wordnet lemmatization
nltk.download('wordnet')
#more for preprocessing
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = PorterStemmer()

import pyLDAvis.gensim

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

import warnings; warnings.simplefilter('ignore')


  and should_run_async(code)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<br>

## 1: Load the dataset <a class="anchor" id="chapter1"></a>

In [2]:
# Locations of the two CSV files in the ISOT source folder.
dataset_path_true = os.path.join("sources", "ISOT", "True.csv")
dataset_path_fake = os.path.join("sources", "ISOT", "Fake.csv")

# Read both files the same way; make sure to use the right encoding.
df_true, df_fake = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (dataset_path_true, dataset_path_fake)
)

# Keep the first rows of each frame around as a small preview / sample set.
dfm_true, dfm_fake = df_true.head(), df_fake.head()

display(dfm_true, dfm_fake)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


<br>

## 2: Split the text into sentences <a class="anchor" id="chapter2"></a>

In lab 03 we used a large hand-written function for this; here we use the NLTK sentence tokenizer instead :)

In [3]:
def split_sentences(article_text):
    """Split one article into its individual sentences.

    Takes the article as a single string, runs NLTK's sentence tokenizer on
    it and returns the sentences as a pandas Series (one sentence per entry,
    in tokenizer order).
    """
    sentences = nltk.tokenize.sent_tokenize(article_text)
    return pd.Series(sentences)

# Tokenise the article in row 0 of the "true" preview frame and show
# its first two and last two sentences.
sample_sentences = split_sentences(dfm_true.loc[0, "text"])
display(sample_sentences.iloc[[0, 1, -2, -1]])

0     WASHINGTON (Reuters) - The head of a conservat...
1     In keeping with a sharp pivot under way among ...
28    The package far exceeded the $44 billion reque...
29             The Senate has not yet voted on the aid.
dtype: object

### 2.1 Corpus of Sentences

In [4]:
# a list of all documents (by sentence)
# NOTE(review): nothing in the visible notebook fills this list - confirm whether it is still needed
corpus_sentences = list()

amount of documents in corpus:  44898


### 2.2 Corpus of Entire Texts

In [None]:
# list of all documents (by entire body): the "true" articles first,
# followed by the "fake" ones
corpus_texts = df_true['text'].tolist() + df_fake['text'].tolist()

print("amount of documents in corpus: ", len(corpus_texts))

<br>

## 3 Text Preprocessing
In order to do proper topic analysis, the text first has to be normalised by removing its uninteresting properties. We lowercase it, lemmatize and stem it, and remove stopwords (it, them, ...) as well as all words of 3 characters or fewer.

In [5]:
def stem_lemmatize(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a
    WordNetLemmatizer, and the resulting lemma is then stemmed with the
    notebook-level ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or that have 3 characters or fewer are dropped, and every
    remaining token is passed through ``stem_lemmatize``.
    """
    return [
        stem_lemmatize(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in gensim.parsing.preprocessing.STOPWORDS
    ]

# Sanity check: show the tokens that preprocessing produces for the document at index 7.
print(preprocess(corpus_texts[7]))

['follow', 'statement', 'post', 'verifi', 'twitter', 'account', 'presid', 'donald', 'trump', 'realdonaldtrump', 'potu', 'opinion', 'express', 'reuter', 'edit', 'statement', 'confirm', 'accuraci', 'realdonaldtrump', 'fake', 'news', 'love', 'talk', 'call', 'approv', 'rat', 'foxandfriend', 'show', 'rat', 'approxim', 'presid', 'obama', 'despit', 'massiv', 'neg', 'trump', 'coverag', 'russia', 'hoax', 'unit', 'state', 'post', 'offic', 'lose', 'billion', 'dollar', 'year', 'charg', 'amazon', 'littl', 'deliv', 'packag', 'make', 'amazon', 'richer', 'post', 'offic', 'dumber', 'poorer', 'charg', 'sourc', 'link', 'jpexyr']


<br>

## 4 Dictionary Creation

The dictionary stores every word that occurs in the corpus and assigns it an integer id; those ids are later used to build the vector representations. Of course not all words are important, so we filter out words that appear in more than 60% of the documents (very likely words that slipped past the stopword list) and words that appear in fewer than 15 documents (not important enough for a topic). Of the words that remain, at most the 100000 most frequent ones are kept.

In [5]:
%%time

# Creating the gensim dictionary of word embeddings

if (0):
    dictionary = gensim.corpora.Dictionary.load(os.path.join("gensim", "dictionary"))


if (0):
    
    processed_corpus_texts = [preprocess(d) for d in corpus_texts]

    dictionary = gensim.corpora.Dictionary(processed_corpus_texts)
    print("unfiltered: ", dictionary)

    dictionary.filter_extremes(no_below=15, no_above=0.6, keep_n=100000)
    
print("  filtered: ", dictionary)

  and should_run_async(code)


NameError: name 'corpus_texts' is not defined

<br>

## 5 Bag of Words creation
Now we create a vector representation, in the form of a bag of words, for each document: a vector that tells us how often each word of the preprocessed text that also exists in our dictionary occurs in that document.

In [10]:
# Bag-of-words corpus: one list of (token id, count) pairs per document.
# Loaded from the pickle written by the saving cell when it exists, otherwise
# computed from the corpus, so `bow_corpus_texts` is always defined.
BOW_PATH = os.path.join("gensim", "bag_of_words.pickle")

if os.path.exists(BOW_PATH):
    # NOTE: pickle.load can execute arbitrary code - only load a file that this
    # notebook wrote itself. A cached file must also match the current
    # dictionary; delete it after rebuilding the dictionary.
    with open(BOW_PATH, 'rb') as f:
        bow_corpus_texts = pickle.load(f)
else:
    # Reuse the tokenised corpus if it is already in the kernel; otherwise tokenise now.
    if "processed_corpus_texts" not in globals():
        processed_corpus_texts = [preprocess(d) for d in corpus_texts]
    bow_corpus_texts = [dictionary.doc2bow(text) for text in processed_corpus_texts]
    

<br>

## 6 LDA model creation


In [7]:
%%time
if (0):
    lda_model = gensim.models.LdaMulticore.load(os.path.join("gensim", "lda_model"))
if (0):
    lda_model5 = gensim.models.LdaMulticore(bow_corpus_texts, num_topics=5, id2word=dictionary, passes=2, workers=2)
    # multicore speeds up the process significantly

Wall time: 0 ns


<br>

## 7 TF-IDF model creation 


In [None]:
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf_model = gensim.models.TfidfModel(corpus=bow_corpus_texts)

<br>

## 8 Case Examination
Take a look at some classifications.

In [25]:
# TODO: use the pyLDAvis view for tweaking the model; save pictures and record the steps taken.

# Enable pyLDAvis output inside the notebook before preparing the view.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the `lda_model50` model, the bag-of-words corpus and the dictionary.
visualisation = pyLDAvis.gensim.prepare(lda_model50, bow_corpus_texts, dictionary)
# Last expression of the cell, so the notebook displays it.
visualisation

  and should_run_async(code)


In [8]:
# Inspect how the LDA model classifies one article.
article = 8

# Preprocess the article once and reuse the tokens below.
article_tokens = preprocess(corpus_texts[article])

print("most frequently occuring word", max(article_tokens, key=article_tokens.count))

# lda_model50[bow] yields (topic index, score) pairs; list them from the
# highest score to the lowest, with the 10 top words of each topic.
for index, score in sorted(lda_model50[bow_corpus_texts[article]],
                           key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model50.print_topic(index, 10)))

print("\n\n", article_tokens, "\n\n", corpus_texts[article])

NameError: name 'preprocess' is not defined

In [None]:
# Inspect the TF-IDF weights of one article.
article = 8

# Preprocess the article once and reuse the tokens below.
article_tokens = preprocess(corpus_texts[article])

print(max(article_tokens, key=article_tokens.count))

# tfidf_model[bow] yields (token id, weight) pairs - these are words, not
# topics, and TfidfModel has no print_topic method. Each token id is turned
# back into its word through the dictionary; highest weight first.
for token_id, weight in sorted(tfidf_model[bow_corpus_texts[article]],
                               key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nWord: {}".format(weight, dictionary[token_id]))

print("\n\n", article_tokens, "\n\n", corpus_texts[article])

<br>

## X Saving items
Because we don't want to sit waiting for everything to be recomputed on every run.

In [1]:
# Persist the expensive intermediate results so later runs can load them.
SAVE_DIR = "gensim"
os.makedirs(SAVE_DIR, exist_ok=True)  # open(..., 'wb') fails if the folder is missing

# Optional saves, left disabled as in the original cell:
# lda_modelXX.save(os.path.join(SAVE_DIR, "lda_modelXX"))
# dictionary.save(os.path.join(SAVE_DIR, "dictionary"))

# Bag-of-words corpus, stored as a pickle.
with open(os.path.join(SAVE_DIR, "bag_of_words.pickle"), 'wb') as f:
    pickle.dump(bow_corpus_texts, f)

NameError: name 'os' is not defined

<br>

### X: Performing sentiment analysis <a class="anchor" id="chapter3"></a>


In [None]:
def get_scores(text: "pd.Series | list", method='VADER'):
    """Compute one sentiment score per sentence.

    Parameters
    ----------
    text : pandas.Series or list of str
        The sentences to score. A plain list (or other sequence) is wrapped
        in a Series first; a Series is used as-is, so its index is kept.
    method : str, default 'VADER'
        Scoring method. Only 'VADER' is implemented.

    Returns
    -------
    pandas.Series or None
        The VADER 'compound' score of every sentence, or None when
        ``method`` is anything other than 'VADER'.
    """
    if method != 'VADER':
        return None

    # .apply only exists on a Series, so convert list input instead of failing on it.
    sentences = text if isinstance(text, pd.Series) else pd.Series(text)
    return sentences.apply(lambda s: sia.polarity_scores(s)['compound'])

# Score the sample sentences with the default method and print the scores
# of the first two and last two sentences.
sample_scores = get_scores(sample_sentences)
print(sample_scores.iloc[[0,1,-2,-1]])