In [None]:
%%capture
#The line above (%%capture) captures this cell's stdout/stderr, so warnings printed during the imports are not shown (careful: this hides ALL of the cell's output)

#This is part 2 of the 6-part tutorial; it shows you how to do topic modelling!
#Our data-set is a list of talks and abstracts from the CCC conference https://gitlab.com/maxigas/cccongresstalks/
#The data is available as .csv file format, we'll play with the data to show what topics are being focused on 
#    using Latent Dirichlet Allocation (LDA).
# In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. 

#this is to avoid showing warning, comment out otherwise.
import warnings
warnings.filterwarnings("ignore")

import numpy as np # adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import pandas as pd # needed for data processing and reading CSV files I/O (e.g. pd.read_csv)

import os
import matplotlib.pyplot as plt #plotting library
import seaborn as sns#data visualization library
import mglearn
from sklearn.feature_extraction.text import CountVectorizer #CountVectorizer used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

In [None]:
# Load the 2016 CCC talks from the CSV file stored in the csvs directory.
# Note that the fields are separated by '|' rather than by commas -- always check your own files first.
File2016_df = pd.read_csv("csvs/2016.csv", sep='|', header=0)
print('Number of titles: {:,}\n'.format(len(File2016_df)))
# Display a random sample of 5 rows.
File2016_df.sample(5)

In [None]:
# Count how often each talk title occurs (title is a column in the csv file, as shown in the sample above).
# The result maps title -> number of rows carrying that title, in first-seen order.
#
# Rows with a missing title are skipped: pandas represents them as float NaN, which has no
# .replace() method and would break the title clean-up step later in this notebook.
title_dict = {}
for title in File2016_df["title"].dropna():
    title_dict[title] = title_dict.get(title, 0) + 1
        


In [None]:
# Optional inspection step: print every title together with its count.
for title, count in title_dict.items():
    print(title, ',', count)

In [None]:
# Do the same for the abstract column (only title_dict is used in this tutorial;
# try the whole exercise again later with the abstracts).
#
# Some talks have no abstract. pandas reads those cells as NaN, so they are dropped explicitly
# with dropna() instead of comparing str(x) against 'nan'.
#
# NOTE: despite its name, abstract_dict ends up as a *list* of the unique abstracts
# (in first-seen order); that is what the abstract pre-processing cell iterates over.
abstract_dict = list(dict.fromkeys(File2016_df["abstract"].dropna()))

In [None]:
# spaCy tokenizer
# A blank English pipeline is sufficient here: tokenize() below only relies on the tokenizer and on
# lexical token attributes (orth_, lower_, like_url). The previous spacy.load('en_core_web_sm') call
# loaded a full trained model and immediately discarded it, so it has been removed.
import spacy
from spacy.lang.en import English
parser = English()
def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level spaCy ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'`` (useful when mentions are used to
    anonymise users), and every other token is lower-cased.

    Returns:
        list: the normalised tokens in document order.
    """
    cleaned = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            continue  # nothing to keep for pure whitespace
        if tok.like_url:  # the token resembles a URL
            cleaned.append('URL')
        elif surface.startswith('@'):
            cleaned.append('SCREEN_NAME')
        else:
            cleaned.append(tok.lower_)
    return cleaned

In [None]:
#NLTK's WordNet corpus is used by get_lemma() below (through wn.morphy) to map a word to its base form.
#English and German stop-word lists are loaded further down in this cell, because the talks contain German text, especially in the earlier conferences.

import nltk
nltk.download('wordnet')#you may comment this out once WordNet has already been downloaded
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form that ``wn.morphy`` finds for ``word``, or ``word`` itself when it finds none."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
#Download the stop-word lists from NLTK (this line can be commented out once they are already downloaded).
nltk.download('stopwords')
#Build sets of the English and German stop words; preprocess_text() below tests tokens against them.
en_stop = set(nltk.corpus.stopwords.words('english'))
de_stop = set(nltk.corpus.stopwords.words('german'))

#preprocess text: tokenize, keep words with more than 4 letters (my choice, you can change it),
#remove English and German stop words, then lemmatize what is left
def preprocess_text(text):
    """Turn raw ``text`` into a list of cleaned, lemmatized tokens.

    Steps: ``tokenize(text)``; keep tokens longer than 4 characters; drop every token found
    in ``en_stop`` or ``de_stop``; map each remaining token through ``get_lemma``.

    Returns:
        list: the processed tokens, in their original order.
    """
    # A token survives only when it is in NEITHER stop-word list. The earlier condition
    # `token not in en_stop or token in de_stop` let every German stop word through.
    tokens = [
        token
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop and token not in de_stop
    ]
    return [get_lemma(token) for token in tokens]

In [None]:
# Turn the title dictionary into a list holding one cleaned, pre-processed token list per title.
title_text_data = []
print(type(title_text_data))
for counter, title in enumerate(title_dict, start=1):
    # Rewrite the 'z' spellings (attackz, hackz, securityz) before pre-processing.
    normalised_title = title.replace('attackz', 'attacks').replace('hackz', 'hack').replace('securityz', 'security')
    title_tokens = preprocess_text(normalised_title)
    if counter <= 10:  # only preview the first 10 token lists
        print(title_tokens)
    title_text_data.append(title_tokens)

In [None]:
# The same clean-up applied to the abstracts. Only the talk titles are used in this tutorial;
# this cell shows how to prepare the abstracts as well (check the csv file if confused).
abstract_text_data = []
for counter2, abstract in enumerate(abstract_dict, start=1):
    # Rewrite the 'z' spellings (attackz, hackz, securityz) before pre-processing.
    normalised_abstract = abstract.replace('attackz', 'attacks').replace('hackz', 'hack').replace('securityz', 'security')
    abstract_tokens = preprocess_text(normalised_abstract)
    if counter2 <= 2:  # only preview the first 2 token lists
        print(abstract_tokens)
    abstract_text_data.append(abstract_tokens)

In [None]:
#gensim for titles
#Two gensim objects are built here:
#  - a Dictionary, which maps every unique token to an integer id;
#  - a corpus (Bag of Words): for each document, the word ids it contains and their frequencies.
#You can think of the corpus as gensim's equivalent of a Document-Term matrix.
#The main function is doc2bow, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples
from gensim import corpora
title_dictionary = corpora.Dictionary(title_text_data)
title_corpus = [title_dictionary.doc2bow(text) for text in title_text_data]

import os
import pickle
#Pickle serializes the in-memory corpus into a stream of bytes that can be stored as a file.
#The corpus (.pkl) and the dictionary (.gensim) are stored in the 'models' directory;
#we save them so they can be loaded again later when we plot the LDA model.
os.makedirs('models', exist_ok=True)#create the output directory if it does not exist yet
#the with-block closes the file handle as soon as the dump has finished
with open('models/title_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(title_corpus, corpus_file)
title_dictionary.save('models/title_dictionary.gensim')

In [None]:
#gensim for abstracts
#Build the token dictionary and the bag-of-words corpus for the abstracts, then store both in the 'models' directory.
from gensim import corpora
abstract_dictionary = corpora.Dictionary(abstract_text_data)
abstract_corpus = [abstract_dictionary.doc2bow(text) for text in abstract_text_data]

import os
import pickle
os.makedirs('models', exist_ok=True)#create the output directory if it does not exist yet
#the with-block closes the file handle as soon as the dump has finished
with open('models/abstract_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(abstract_corpus, corpus_file)
abstract_dictionary.save('models/abstract_dictionary.gensim')

In [None]:
#title LDA topics (I went with 5 topics and 10 passes; you can change that to show more topics or use more passes --> passes = number of passes through the corpus during training.)
# Background on comparing probability distributions: Jensen–Shannon divergence measures the similarity between two probability distributions https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
# LDA discovers abstract “topics” that occur in a collection of documents (why some parts of the data are similar),
# i.e. what topics are being talked about in the conference talks.

#to run this on the abstracts you need to replace the title_corpus and the title_dictionary with the abstract ones from the previous cell
import gensim
NUM_TOPICS = 5
#LDA training is randomly initialised; fixing the seed makes the topics (and their numbering) the same on every run.
RANDOM_SEED = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    title_corpus,
    num_topics=NUM_TOPICS,
    id2word=title_dictionary,
    passes=10,
    random_state=RANDOM_SEED,
)
ldamodel.save('models/title_model.gensim')
#show the 5 highest-weighted words of every topic
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
#Try a new sentence on networking security.
#LDA assigns the sentence a probability for each topic. Which topic scores highest depends on the trained model
#(an earlier run reported topic 3 first and topic 4 second -- your numbering may differ).
#The printed probabilities should add up to roughly 1 -- presumably very unlikely topics are left out of the list; check the gensim documentation.
new_doc = preprocess_text('Our new technique offers a full spectrum of data privacy and security services and technology')
new_doc_bow = title_dictionary.doc2bow(new_doc)
print(ldamodel.get_document_topics(new_doc_bow))

In [None]:
#display and plot title topics
#to understand the model: http://bl.ocks.org/AlessandraSozzi/raw/ce1ace56e4aed6f2d614ae2243aab5a5/
#and https://www.youtube.com/watch?v=IksL96ls4o0&feature=emb_title&ab_channel=statgraphics
#Saliency: how important a term is regardless of its frequency (i.e. some terms are frequent but not informative)
#Paper and more details: http://vis.stanford.edu/files/2012-Termite-AVI.pdf
#Relevance: how relevant a term is to a certain topic
#Paper and more details: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
#The size of the bubble measures the importance of the topics, relative to the data.
#more weight on lambda shows terms that are relevant to the topic. Less weight shows terms that are most frequent in that topic (beware of stop-words)

#may struggle with numpy >=1.20.x
dictionary = gensim.corpora.Dictionary.load('models/title_dictionary.gensim')
#This pickle was written earlier by this notebook; never unpickle files that come from an untrusted source.
#The with-block closes the file handle once the corpus is loaded.
with open('models/title_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('models/title_model.gensim')
import pyLDAvis
#pyLDAvis 3.3 renamed its gensim helper module from pyLDAvis.gensim to pyLDAvis.gensim_models; support both.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
pyLDAvis.enable_notebook()#this is to make it run within a Jupyter notebook, not needed if using a python IDE
lda_display = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(lda_display, './models/titles_lda.html')#save the LDA visualisation to the models directory for you to use later.
pyLDAvis.display(lda_display, template_type='notebook')

In [None]:
%%time
import gensim
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
#lda = gensim.models.ldamodel.LdaModel.load('models/title_model.gensim')

In [None]:
# One bar chart per topic: the terms returned by lda.show_topic() and their probabilities.
n_topics = lda.num_topics  # follow the loaded model instead of hard-coding 5 topics
n_cols = 2
# Keep the original 5x2 grid; add rows only when the model has more than 10 topics.
n_rows = max(5, -(-n_topics // n_cols))

fig = plt.figure(figsize=(15, 30))
for topic_id in range(n_topics):
    topic_terms = pd.DataFrame(lda.show_topic(topic_id), columns=['term', 'prob']).set_index('term')

    plt.subplot(n_rows, n_cols, topic_id + 1)
    plt.title('topic ' + str(topic_id + 1))  # titles are 1-based, gensim topic ids are 0-based
    sns.barplot(x='prob', y=topic_terms.index, data=topic_terms, palette='Reds_d')
    plt.xlabel('probability')


plt.show()

In [None]:
#***********Try it yourself*******

#so now try to replicate the LDA topics and visualisation process on the abstracts. I've already created the pkl and gensim files for you.

