In [None]:
#The First International Workshop on Arabic Big Data & AI (IWABigDAI) May 11 and May 12 2022
#https://sites.google.com/view/arabicbigdata/home

#Tutorial 4: Visualising with Topic Modelling using Latent Dirichlet Allocation (LDA)
#author: Dr Mahmoud El-Haj (with help from the Internet)
#GitHub repository: https://github.com/drelhaj/NLP_ML_Visualization_Tutorial

In [None]:
%%capture
#The %%capture line above suppresses everything this cell prints or displays, including warning messages (careful: this hides all of the cell's output)

#Part 4 of this 4-part tutorial shows you how to do topic modelling!
#Our data-set is a list of talks and abstracts from the CCC conference https://gitlab.com/maxigas/cccongresstalks/
#The data is available as .csv file format, we'll play with the data to show what topics are being focused on 
#    using Latent Dirichlet Allocation (LDA).
# In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. 

#this is to avoid showing warning, comment out otherwise.
import warnings
warnings.filterwarnings("ignore")

import numpy as np # adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import pandas as pd # needed for data processing and reading CSV files I/O (e.g. pd.read_csv)

import os
import matplotlib.pyplot as plt #plotting library
import seaborn as sns#data visualization library
#import mglearn #Helper functions for the book 'Introduction to machine learning with Python'
from sklearn.feature_extraction.text import CountVectorizer #CountVectorizer used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

In [None]:
#Load csvs/2023-Crete.csv into a DataFrame (comma-separated, first row holds the column names).
#Check the delimiter of your own files first: not every "CSV" export uses a comma.
#NOTE(review): other comments in this notebook still describe the CCC talks data-set
#    (https://gitlab.com/maxigas/cccongresstalks/); confirm which data source is intended.

File2022_df = pd.read_csv("csvs/2023-Crete.csv", sep=',', header=0, encoding='utf8')
print(f'Number of titles: {len(File2022_df):,}\n')
#Display 3 randomly chosen rows (no seed is set, so the sample differs between runs).
File2022_df.sample(n=3)

In [None]:
#Count how often each distinct tweet text occurs.
#(TWEETTEXT is a column in the csv file, as shown in the sample above.)
#tweets_dict maps tweet text -> number of occurrences; keys keep first-seen order.

tweets_dict = {}

#Iterate over the column values directly instead of looking rows up by label, so the
#loop also works when the DataFrame index is not 0..n-1 (e.g. after filtering rows).
#Rows with a missing TWEETTEXT are skipped: pandas stores them as float NaN, which
#would otherwise become dictionary keys and crash the regex-based cleaning later on.
for tweet_text in File2022_df["TWEETTEXT"].dropna():
    tweets_dict[tweet_text] = tweets_dict.get(tweet_text, 0) + 1


In [None]:
#Optional: list every distinct tweet next to its occurrence count (safe to skip this cell).
for tweet_text, occurrences in tweets_dict.items():
    print(tweet_text, ',', occurrences)
    

In [None]:
#spaCy tokenizer setup
#English() builds a blank English pipeline; tokenize() only needs its tokenizer and the
#lexical token attributes (orth_, lower_, like_url), so no trained model is required.
#The previous spacy.load('en_core_web_sm') call was removed: its return value was
#discarded, so it only slowed the cell down and made the notebook fail whenever that
#model package was not installed.
import spacy
from spacy.lang.en import English
parser = English()
def tokenize(text):
    """Split `text` into a list of normalised tokens using the module-level `parser`.

    Whitespace-only tokens are dropped, tokens that resemble a URL become the
    placeholder 'URL', tokens starting with '@' (user mentions) become
    'SCREEN_NAME', and every other token is returned lower-cased.
    """
    cleaned = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            continue
        if tok.like_url:  # the token resembles a URL
            cleaned.append('URL')
            continue
        # mentions are replaced by a placeholder so that users stay anonymous
        cleaned.append('SCREEN_NAME' if surface.startswith('@') else tok.lower_)
    return cleaned

In [None]:
%%capture
#NLTK’s Wordnet used to find the meanings of words, synonyms, antonyms, and more. 
#Only the English stop-word list from NLTK is loaded below; add further languages (e.g. German) if your texts contain them.

import nltk
import re
nltk.download('wordnet')#you may turn this one off if you've already downloaded the wordnet
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form WordNet's morphy() finds for `word`, or `word` unchanged when it finds none."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
#Fetch NLTK's stop-word lists (the download can be switched off once they are on disk),
#then keep the English list as a set so that membership tests are fast.
nltk.download('stopwords')
en_stop = {*nltk.corpus.stopwords.words('english')}

#Pre-process one text: keep letters only, tokenize, drop short tokens and English stop
#words, then lemmatise. The minimum token length defaults to 3 (my choice, you can change it).
def preprocess_text(text, min_length=3):
    """Turn a raw text into a list of cleaned, lemmatised tokens.

    Parameters:
        text: the raw text. Anything that is not a string (for example the float
            NaN pandas uses for a missing cell) yields an empty list instead of
            raising inside re.sub.
        min_length: tokens with fewer characters than this are discarded.

    Returns:
        A list of tokens produced by tokenize(), filtered by length and by the
        English stop-word set `en_stop`, each passed through get_lemma().

    NOTE(review): every character other than A-Z, a-z and space is removed before
    tokenize() runs, so '@' and URL punctuation are already gone by then and
    tokenize()'s URL / SCREEN_NAME branches are presumably never taken. Non-Latin
    text is removed entirely. Confirm that this is intended.
    """
    if not isinstance(text, str):
        return []

    # Turn newlines, tabs and other whitespace into plain spaces first. The
    # letters-only filter below deletes them otherwise, fusing the last word of
    # one line with the first word of the next.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('[^A-Za-z ]+', '', text)
    text = re.sub(' +', ' ', text)
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_length and token not in en_stop
    ]

In [None]:
#Quick demonstration: run the pre-processing on a single English sentence and print the tokens.
sentenceTest = "Attending a seminar today to understand work completed on exploring research priority "
print(preprocess_text(text=sentenceTest))


In [None]:
#Build the training data: one list of cleaned tokens per distinct tweet in tweets_dict.
#The token lists of the first 10 tweets are printed as a preview.
tweet_text_data = []
print(type(tweet_text_data))
counter = 0  # stays 0 when tweets_dict is empty
for counter, entry in enumerate(tweets_dict, start=1):
    tweet_tokens = preprocess_text(entry)
    if counter <= 10:
        print(tweet_tokens)
    tweet_text_data.append(tweet_tokens)

In [None]:
#gensim dictionary and corpus for the tweets
#The Dictionary assigns an integer id to every distinct token; the corpus is the bag-of-words
#form of the documents, i.e. for each document the ids of its words and their frequencies.
#You can think of it as gensim's equivalent of a Document-Term matrix.
#The main function is doc2bow, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples
from gensim import corpora
tweets_dictionary = corpora.Dictionary(tweet_text_data)
tweets_corpus = [tweets_dictionary.doc2bow(text) for text in tweet_text_data]

import os
import pickle
#Pickle serialises the corpus (a plain Python list) into a stream of bytes that can be stored as a file.
#The .pkl and .gensim files are stored in the 'models' directory,
#so that they can be loaded again later when we plot the LDA model.
os.makedirs('models', exist_ok=True)  # a fresh checkout may not contain the directory yet
with open('models/tweets_corpus.pkl', 'wb') as corpus_file:  # 'with' closes the file even if dump() fails
    pickle.dump(tweets_corpus, corpus_file)
tweets_dictionary.save('models/tweets_dictionary.gensim')

In [None]:
#Train the LDA topic model on the tweets (I went with 5 topics and 10 passes, you can update that to show more topics or have several passes --> Number of passes through the corpus during training.)
# probability distribution (Jensen–Shannon divergence, measures similarity between two probability distributions https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence)
# Discover abstract "topics" that occur in a collection of documents (why some parts of the data are similar)
# i.e. what are the topics people are talking about in the tweets?

#To model a different text column, rebuild tweets_corpus and tweets_dictionary from it in the previous cell.
import gensim
NUM_TOPICS = 5
RANDOM_SEED = 42  # LDA training is stochastic; a fixed seed makes the topics reproducible between runs
ldamodel = gensim.models.ldamodel.LdaModel(
    tweets_corpus,
    num_topics=NUM_TOPICS,
    id2word=tweets_dictionary,
    passes=10,
    random_state=RANDOM_SEED,
)
ldamodel.save('models/tweets_model.gensim')
#Show the 5 highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
#Infer the topic mixture of a sentence the model has not seen before.
#The sentence gets the same pre-processing as the training tweets and is then mapped
#onto the training dictionary as a bag of words.
#The printed result is a list of (topic id, probability) pairs.
#NOTE(review): which topic ranks highest depends on the trained model, and gensim may leave
#    out very unlikely topics, so the printed probabilities can sum to slightly less than 1.
new_doc = preprocess_text('I look forward to my graduation day at Lancaster University')
new_doc_bow = tweets_dictionary.doc2bow(new_doc)
print(ldamodel.get_document_topics(bow=new_doc_bow))

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

#display and plot tweets topics
#to understand the model: http://bl.ocks.org/AlessandraSozzi/raw/ce1ace56e4aed6f2d614ae2243aab5a5/
#and https://www.youtube.com/watch?v=IksL96ls4o0&feature=emb_title&ab_channel=statgraphics
#Saliency: how important regardless of the term frequency (i.e. some terms are frequent but not informative)
#Paper and more details: http://vis.stanford.edu/files/2012-Termite-AVI.pdf
#Relevance: how relevant is a term to a certain topic
#Paper and more details: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
#The size of the bubble measures the importance of the topics, relative to the data.
#more weight on lambda shows terms that are relevant to the topic. Less weight shows terms that are most frequent in that topic (beware of stop-words)

#may struggle with numpy >=1.20.x
#Reload the dictionary, corpus and model that the earlier cells saved in the 'models' directory.
dictionary = gensim.corpora.Dictionary.load('models/tweets_dictionary.gensim')
#Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('models/tweets_corpus.pkl', 'rb') as corpus_file:  # 'with' closes the file handle
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('models/tweets_model.gensim')
import pyLDAvis
try:
    #newer pyLDAvis releases renamed the gensim helper module to gensim_models
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    #older releases still ship it as pyLDAvis.gensim
    import pyLDAvis.gensim as gensimvis
pyLDAvis.enable_notebook()#this to make it run with Jupyter notebook, not needed if using python IDE
lda_display = gensimvis.prepare(lda, corpus, dictionary)
os.makedirs('./plots', exist_ok=True)  # a fresh checkout may not contain the directory yet
pyLDAvis.save_html(lda_display, './plots/tweets_lda.html')#save the LDA visualisation to the plots directory for you to use later.
pyLDAvis.display(lda_display, template_type='notebook')

In [None]:
%%time
import gensim
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import arabic_reshaper # this was missing in your code
from bidi.algorithm import get_display
#lda = gensim.models.ldamodel.LdaModel.load('models/title_model.gensim')

In [None]:
def reshapeText(dfindex):
    """Return the labels of `dfindex` as a list of display-ready strings.

    Every label is passed through arabic_reshaper.reshape() and then through
    bidi's get_display(), so that Arabic terms render correctly in the plots.
    `dfindex` must offer .tolist() (e.g. a pandas Index).
    """
    return [get_display(arabic_reshaper.reshape(label)) for label in dfindex.tolist()]

In [None]:
#One horizontal bar chart per topic, showing the probability of the topic's top terms.
#The number of charts follows the loaded model instead of a hard-coded 5: with fewer
#topics lda.show_topic(i) would fail, and with more topics the extra ones would never be drawn.
n_topics = lda.num_topics
#6 inches of height per topic keeps the original 15x30 figure for a 5-topic model.
fiz = plt.figure(figsize=(15, 6 * n_topics))
for i in range(n_topics):
    df = pd.DataFrame(lda.show_topic(i), columns=['term', 'prob']).set_index('term')
    #df=df.sort_values('prob')# uncomment to sort in ascending order
    reshapedList = reshapeText(df.index)  # display-ready labels (handles Arabic terms)
    #Draw on an explicit Axes rather than relying on pyplot's "current axes" state.
    ax = fiz.add_subplot(n_topics, 2, i + 1)
    ax.set_title('topic ' + str(i + 1))
    sns.barplot(x='prob', y=reshapedList, data=df, palette='Reds_d', ax=ax)#'Greens_d', Blues_d
    ax.set_xlabel('probability')
plt.show()