## **LDA documentation**

> https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation

## **Tutorials**

> https://shravan-kuchkula.github.io/topic-modeling/#diagnose-model-performance-using-perplexity-and-log-likelihood
> https://github.com/NeverForged/LDATopicCoherence/blob/master/TopicCoherence.ipynb
> https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
> https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/


## **Imports**

In [None]:
!pip install unidecode
!pip install pyldavis

import csv
%matplotlib inline 
import numpy as np
import pandas as pd
import seaborn as sb
# from irlb import irlb
from scipy import stats
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
import matplotlib.patches as mpatches
import scipy.spatial.distance as scdist
from IPython.display import display, HTML
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
#from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
sb.set_style("whitegrid", {'axes.grid' : False})
import statsmodels.sandbox.stats.multicomp as mc
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import silhouette_score, silhouette_samples

import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem import RSLPStemmer
from textblob import TextBlob
import collections
import unidecode

from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

# from pyLDAvis import sklearn as sklearn_lda
import pickle 
import os

import pyLDAvis
import pyLDAvis.sklearn

# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import make_multilabel_classification

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

sns.set_style('whitegrid')
pyLDAvis.enable_notebook()

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 15.7MB/s eta 0:00:01[K     |██▊                             | 20kB 1.6MB/s eta 0:00:01[K     |████▏                           | 30kB 2.2MB/s eta 0:00:01[K     |█████▌                          | 40kB 2.5MB/s eta 0:00:01[K     |██████▉                         | 51kB 1.9MB/s eta 0:00:01[K     |████████▎                       | 61kB 2.2MB/s eta 0:00:01[K     |█████████▋                      | 71kB 2.4MB/s eta 0:00:01[K     |███████████                     | 81kB 2.6MB/s eta 0:00:01[K     |████████████▍                   | 92kB 2.8MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 2.7MB/s eta 0:00:01[K     |███████████████▏                | 112kB 2.7MB/s eta 0:00:01[K     |████████████████▌               | 122kB 2.7MB/

  import pandas.util.testing as tm


## **Topic Coherence Class**

In [None]:
# https://github.com/NeverForged/LDATopicCoherence/blob/master/TopicCoherence.ipynb
from scipy.sparse import csr_matrix

class TopicCoherence(object):
    '''
    Topic coherence for scikit-learn style topic models (no Gensim needed).

    Based on the information here: http://qpleple.com/topic-coherence-to-evaluate-topic-models/

    ATTRIBUTES
    words_to_use: number of top-weighted words of each topic that are scored
    score: measure to report: 'UCI', 'UMass' or 'both' (product of the two)
    vocabulary: the vocabulary given to fit, saved so we know what word is what
    docs_words: binary document/word incidence matrix (1.0 where count > 0)
    Di: document counts D(wi), one column per word
    Dij: co-document counts D(wi,wj), with D(wi) on the diagonal
    pi: p(wi) = D(wi) / number of documents
    pij: p(wi,wj) = D(wi,wj) / number of documents
    UCI_score: lazily filled cache of UCI scores (only i <= j is stored)
    UMass_score: lazily filled cache of UMass scores

    METHODS:
    fit(vocabulary, transformed_docs): Creates the document counts and the
        probabilities for the corpus and vectorizer used.
        vocabulary: CountVectorizer.vocabulary_
        transformed_docs: document-term count matrix (sparse or dense), as
                    produced by the count vectorizer.
    myTCScore(model): mean coherence over the top words of every topic.
    tc_score(model, X, y): same value, with an (estimator, X, y) signature.

    Darin LaSota, 2/7/2019

    '''
    _VALID_SCORES = ('UCI', 'UMass', 'both')

    def __init__(self,words=10,score='both'):
        '''
        Initializer.

        words: how many top words per topic are compared with each other.
        score: 'UCI', 'UMass' or 'both'.
        '''
        self.words_to_use = words
        self.score = score
        # Overwritten by fit(); defined here so the attribute always exists.
        self.verbose = False

    def fit(self, vocabulary, transformed_docs, verbose=False):
        '''
        Pre-computes the document counts and probabilities for a corpus.

        These only depend on the corpus, so they are computed once and reused
        for every model that is scored afterwards (e.g. in a grid search).
        Returns self.
        '''
        self.verbose = verbose
        if verbose:
            print('Starting...')
        self.vocabulary = vocabulary  # save for later
        # Accept dense input too; on sparse matrices this is a cheap re-wrap.
        counts = csr_matrix(transformed_docs)
        n_docs = counts.shape[0]
        # Only presence/absence of a word in a document matters.
        self.docs_words = (counts > 0).astype(float)
        self.Di = self.docs_words.sum(axis=0)
        if verbose:
            print('Di done')
        # Explicit matrix product: entry (i, j) counts documents with both words.
        self.Dij = self.docs_words.transpose().dot(self.docs_words)
        if verbose:
            print('Dij done')
        self.pi = self.Di/n_docs
        self.pij = self.Dij/n_docs
        if verbose:
            print('pi and pij done')
        # Save the scores as they are made, to avoid redundancy. LIL format
        # supports cheap item assignment (CSR warns on every new entry).
        self.UCI_score = csr_matrix(self.Dij.shape).tolil()
        self.UMass_score = csr_matrix(self.Dij.shape).tolil()
        return self

    def UCI(self,i,j):
        '''
        Calculates the following:
                score(wi,wj) = log(1 + p(wi,wj)/p(wi)p(wj))
        Added the smoothing factor of 1 to keep results positive (and not heading toward
        negative infinity). The score is symmetric, so only i <= j is cached.
        Returns 0.0 when either word occurs in no document.
        '''
        if i > j:
            i, j = j, i
        # A stored 0 means "not computed yet" (true zeros are simply recomputed).
        if self.UCI_score[i,j] == 0:
            denominator = self.pi[0,i]*self.pi[0,j]
            if denominator == 0:
                return 0.0
            self.UCI_score[i,j] = np.log(1 + self.pij[i,j]/denominator)
        return self.UCI_score[i,j]

    def UMass(self,i,j):
        '''
        Calculates the following:
            score(wi,wj) = log(1 + D[wi,wj]/D[wi])
        Not symmetric, so (i,j) and (j,i) are cached separately.
        Returns 0.0 when word i occurs in no document.
        '''
        if self.UMass_score[i,j] == 0:
            doc_count = self.Di[0,i]
            if doc_count == 0:
                return 0.0
            self.UMass_score[i,j] = np.log(1 + self.Dij[i,j]/doc_count)
        return self.UMass_score[i,j]

    def _score_model(self, model):
        '''
        Shared implementation of myTCScore and tc_score.

        For every topic in model.components_ the words_to_use highest-weighted
        words are taken; each of them gets the mean pairwise score against the
        other top words, and the mean over all of these values is returned.
        Raises ValueError when self.score is not a supported measure.
        '''
        if self.score not in self._VALID_SCORES:
            raise ValueError(
                "score must be one of {}, got {!r}".format(self._VALID_SCORES, self.score))
        use_uci = self.score in ('UCI', 'both')
        use_umass = self.score in ('UMass', 'both')
        score = []
        for topic_weights in model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:self.words_to_use]
            for word in top_keyword_locs:
                others = [a for a in top_keyword_locs if a != word]
                if use_uci:
                    uci = np.mean([self.UCI(word,a) for a in others])
                if use_umass:
                    umass = np.mean([self.UMass(word,a) for a in others])
                if use_uci and use_umass:
                    score.append(uci*umass)
                elif use_uci:
                    score.append(uci)
                else:
                    score.append(umass)
        mean_score = np.mean(score)
        if self.verbose:
            print('Mean {} Score: {:.2f}'.format(self.score, mean_score), end='\r')
        return mean_score

    def myTCScore(self,model):
        '''
        Mean topic coherence of a fitted model (needs model.components_).
        '''
        return self._score_model(model)

    def tc_score(self,model,X,y=None):
        '''
        Same as myTCScore; X and y are accepted but ignored.
        NOTE(review): the (model, X, y) signature looks intended for use as a
        scikit-learn scoring callable - confirm with the callers.
        '''
        return self._score_model(model)

## **Getting dataset**

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; force_remount=True re-mounts even
# when a mount is already active.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Show long text cells (title + abstract) without truncating them too early.
pd.set_option('display.max_colwidth', 500)
df = pd.read_csv('/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/dataset/20200806_AllAcceptedPapers.csv', sep=',', quoting=csv.QUOTE_ALL)
# Bibliographic / classification metadata is not used below.
df = df.drop(columns=['Author', 'Venue', 'Venue Type', 'Impact Factor', 'Journal', 'DOI', 'Year', 'DocType', 'Source', 'Evaluation', 'Our clustering', 'First Author Name', 'First Author Country', 'Publication Source', 'Research Type', 'Empirical Validation', 'Type of Solution', 'Contribution Type', 'User Profile'])
# One text field per paper. A missing title or abstract is treated as empty so
# that 'text' is always a string (NaN + str would make the whole value NaN).
df['text'] = df["Title"].fillna('') + ' ' + df["Abstract"].fillna('')
df = df.drop(columns=['Title', 'Abstract'])
df.shape

(94, 2)

## **Preprocessing tasks**

In [None]:
# Lemmatizer used later by the tokenizer.
lemmatizer = WordNetLemmatizer()

# NLTK resources fetched for this notebook.
nltk.download('stopwords')
nltk.download('rslp')
nltk.download('punkt')
nltk.download('wordnet')

# A custom whitespace-separated stop-word file is used instead of NLTK's list:
#stops = set(stopwords.words("english"))
# The context manager closes the file handle as soon as it has been read.
with open('/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/stop_words.txt', encoding='utf-8') as stops_file:
    file_stops = stops_file.read()
stops = file_stops.split()
print("Stopwords:", len(stops))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Stopwords: 1123


In [None]:
# Drop short words and force singular forms (through lemmatisation).
min_length = 3
def textblob_tokenizer(str_input):
    """Clean one document and return it as a space-joined token string.

    Steps: replace every character that is not a (possibly accented) letter or
    a space with a space, lower-case, tokenize with NLTK, drop stop words and
    tokens shorter than ``min_length``, strip accents with unidecode and
    lemmatise each remaining token.

    Returns "" when the input is not a string (e.g. NaN from a missing value)
    or when the cleaned text is 3 characters long or shorter.
    """
    if not isinstance(str_input, str):
        return ""
    letters_only = re.sub(u'[^a-zA-ZáéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ ]', ' ', str_input)
    tokens = nltk.word_tokenize(letters_only.lower())
    # Stop words are matched before accents are stripped.
    tokens = [
        unidecode.unidecode(token)
        for token in tokens
        if token not in stops and len(token) >= min_length
    ]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    text = ' '.join(tokens)
    return text if len(text) > 3 else ""

In [None]:
# Lower-case the raw text, then clean every document with the tokenizer and
# keep the result next to the original text.
abstracts = df['text'].str.lower()
df['text_processed'] = list(map(textblob_tokenizer, abstracts))
df.head()

Unnamed: 0,ID,text,text_processed
0,194213,"Cross-Sectoral Big Data: The Application of an Ethics Framework for Big Data in Health and Research Discussion of uses of biomedical data often proceeds on the assumption that the data are generated and shared solely or largely within the health sector. However, this assumption must be challenged because increasingly large amounts of health and well-being data are being gathered and deployed in cross-sectoral contexts such as social media and through the internet of (medical) things and wear...",cross sectoral big data application ethic framework big data health research discussion biomedical data proceeds assumption data generated shared solely largely health sector assumption challenged increasingly large amount health data gathered deployed cross sectoral context social medium internet medical thing wearable device cross sectoral sharing data refers generation linkage biomedical data health sector paper considers challenge arise phenomenon benefit fully important ethical value st...
1,194209,"Internet of things (IoT) applications for elderly care: a reflective review Increasing in elderly population put extra pressure on healthcare systems globally in terms of operational costs and resources. To minimize this pressure and provide efficient healthcare services, the application of the Internet of Things (IoT) and wearable technology could be promising. These technologies have the potential to improve the quality of life of the elderly population while reducing strain on healthcare ...",internet thing iot application elderly care reflective review increasing elderly population put extra pressure healthcare system globally term operational cost resource minimize pressure efficient healthcare service application internet thing iot wearable technology promising technology potential improve quality life elderly population reducing strain healthcare system minimizing operational cost iot wearable application elderly healthcare purpose reviewed previously summarize current applic...
2,194182,"Wearable hardware design for the internet of medical things (IoMT) As the life expectancy of individuals increases with recent advancements in medicine and quality of living, it is important to monitor the health of patients and healthy individuals on a daily basis. This is not possible with the current health care system in North America, and thus there is a need for wireless devices that can be used from home. These devices are called biomedical wearables, and they have become popular in t...",wearable hardware design internet medical thing iomt life expectancy individual increase recent advancement medicine quality living important monitor health patient healthy individual daily basis current health care system north america wireless device home device biomedical wearable popular decade reason main expensive health care longer wait time increase public awareness improving quality life vital wearable understanding designed significance factor considered hardware designed study att...
3,194165,"Biosignal monitoring using wearables: Observations and opportunities Advances in data acquisition technologies, sensor design, data frameworks, smart device connectivities, Internet-of-things, rising health care costs and public awareness towards a better quality of life, have spurred a boom in development of wearable ""health-tech"" devices in the smart device market. Tele-monitoring of human body dynamics through activities of daily life has become a popular lifestyle choice for consumers, a...",biosignal monitoring wearable observation opportunity advance data acquisition technology sensor design data framework smart device connectivity internet thing rising health care cost public awareness quality life spurred boom development wearable health tech device smart device market tele monitoring human body dynamic activity daily life popular lifestyle choice consumer help track parameter food intake calorie burnt activity level calling nearest health care facility emergency device give...
4,194110,"Liquid level sensing using commodity wifi in a smart home environment The popularity of Internet-of-Things (IoT) has provided us with unprecedented opportunities to enable a variety of emerging services in a smart home environment. Among those services, sensing the liquid level in a container is critical to building many smart home and mobile healthcare applications that improve the quality of life. This paper presents LiquidSense, a liquid level sensing system that is low-cost, high accurac...",liquid level sensing commodity wifi smart home environment popularity internet thing iot provided unprecedented opportunity enable variety emerging service smart home environment service sensing liquid level container critical building smart home mobile healthcare application improve quality life paper present liquidsense liquid level sensing system low cost high accuracy widely applicable daily liquid container easily integrated existing smart home network liquidsense existing home wifi net...


In [None]:
# Persist the dataset together with its cleaned text (row index not written).
processed_csv_path = r'/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/dataset/text-processed.csv'
df.to_csv(processed_csv_path, index=False)

## **Grid Search**

In [None]:
from sklearn.model_selection import GridSearchCV

# NOTE: this cell uses `count_data` and `Topic_Coherence`, which are created in
# a cell further down the notebook; run that cell first.

# Hyper-parameter grid explored for the LDA model.
search_params = {
    'n_components': list(range(4, 16)),
    'learning_decay': [0.5, 0.6, 0.7, 0.8, 0.9],
    'learning_offset': list(range(8, 13)),
    'max_iter': list(range(5, 21, 5)),
}

# Exhaustive search over the grid, starting from a default LDA estimator.
ldaS = LDA()
model = GridSearchCV(estimator=ldaS, param_grid=search_params)
model.fit(count_data)

# Report the winning configuration and how it scores on the full matrix.
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Best Model Perplexity: ", best_lda_model.perplexity(count_data))
print("Best Topic Coherence: ", Topic_Coherence.myTCScore(best_lda_model))

Best Model's Params:  {'learning_decay': 0.5, 'learning_offset': 11, 'max_iter': 15, 'n_components': 4}
Best Log Likelihood Score:  -12658.431383193298
Best Model Perplexity:  305.1354911864333
Mean both Score: 0.32Best Topic Coherence:  0.32160021460508714


  self._set_intXint(row, col, x.flat[0])


## **LDA Algorithm**

In [None]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted words of every topic.

    model: fitted topic model exposing ``components_`` (one row per topic).
    count_vectorizer: fitted vectorizer that maps column indices to words.
    n_top_words: number of words shown per topic, highest weight first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older versions.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending: walk it backwards to get the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_indices))
        
def generate_topics(count_vectorizer, count_data, number_topics, number_words, random_state=None):
    """Fit and return an LDA model with ``number_topics`` topics.

    count_vectorizer: unused; kept so existing calls keep working.
    count_data: document-term matrix the model is fitted on.
    number_topics: value passed as ``n_components``.
    number_words: unused; kept so existing calls keep working.
    random_state: optional seed forwarded to the model so that a run can be
        reproduced; the default (None) keeps the previous unseeded behaviour.

    learning_decay, learning_offset and max_iter are fixed to 0.5, 11 and 15;
    the model is trained with n_jobs=-1. Use print_topics() on the returned
    model to display its topics.
    """
    lda = LDA(
        n_components=number_topics,
        learning_decay=0.5,
        learning_offset=11,
        max_iter=15,
        n_jobs=-1,
        random_state=random_state,
    )
    lda.fit(count_data)
    return lda

In [None]:
# Bag-of-words counts over the cleaned text: unigrams only, ignoring terms that
# appear in fewer than 5 documents or in more than 80% of them.
count_vectorizer = CountVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=(1, 1))
count_data = count_vectorizer.fit_transform(df['text_processed'])

#tfidf_vectorizer = TfidfVectorizer(**count_vectorizer.get_params())
#dtm_tfidf = tfidf_vectorizer.fit_transform(df['text_processed'])

# Pre-compute the corpus statistics used by the coherence scores.
# `vocabulary_` (trailing underscore) is the fitted term -> column mapping;
# `vocabulary` is only the constructor argument, which is None here.
Topic_Coherence = TopicCoherence()
Topic_Coherence.fit(count_vectorizer.vocabulary_, count_data, True)

# Percentage of non-zero cells in the document-term matrix.
data_dense = count_data.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Starting...
Di done
Dij done
pi and pij done
Sparsicity:  13.039283991639422 %


In [None]:
# Candidate numbers of topics; one LDA model is trained for each.
topics = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

# One metrics row per candidate; the lists below are filled in the same order.
metricsDF = pd.DataFrame(topics, columns=['nTopic'])
likelihoods = []
perplexities = []
coherences = []
params = []

# Disabled TF-IDF variant of the loop below, kept for reference:
#for topic in topics:
#  lda_tfidf = generate_topics(tfidf_vectorizer, dtm_tfidf, topic, 10)
#  joblib.dump(lda_tfidf, "/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/models/td-idf-topics-{}.jl".format(topic))
#  preparedData = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
#  pyLDAvis.save_html(preparedData, "/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/html/td-idf-topics-{}.html".format(topic))
#  classification = lda_tfidf.transform(dtm_tfidf)
#  topicsvalues = []
#  for topicnum in classification:
#    max_value = max(topicnum)
#    i, = np.where(np.isclose(topicnum, max_value))
#    topicsvalues.append(i[0] + 1)

#  df["topic-for-td-idf-{}-model".format(topic)] = topicsvalues

for topic in topics:
  # Train on raw term frequencies, then persist the model and its pyLDAvis page.
  lda = generate_topics(count_vectorizer, count_data, topic, 10)
  joblib.dump(lda, "/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/models/frequency-topics-{}.jl".format(topic))
  preparedData = pyLDAvis.sklearn.prepare(lda, count_data, count_vectorizer)
  pyLDAvis.save_html(preparedData, "/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/html/frequency-topics-{}.html".format(topic))

  # Label every document with its dominant topic, numbered from 1.
  # argmax returns the first index of the exact row maximum, so a document is
  # never assigned to a topic that is merely "close" to the best one.
  classification = lda.transform(count_data)
  topicsvalues = (classification.argmax(axis=1) + 1).tolist()

  df["topic-for-frequency-{}-model".format(topic)] = topicsvalues
  likelihoods.append(lda.score(count_data))
  perplexities.append(lda.perplexity(count_data))
  coherences.append(Topic_Coherence.myTCScore(lda))
  params.append(lda.get_params())

metricsDF['likelihood'] = likelihoods
metricsDF['perplexity'] = perplexities
metricsDF['tcoherence'] = coherences
metricsDF['params'] = params

# Save the per-document topic labels and the per-model metrics.
df.to_csv(r'/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/lda-result.csv', index = False)
metricsDF.to_csv(r'/content/drive/My Drive/PedroAlmir/09_Doutorado/UFC/projetos/16_PhdProposal/review/topicModeling/lda-metrics.csv', index = False)

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.30

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.31

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.29

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.31

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.28

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.28

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.31

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.29

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.28

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.30

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.30

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.26

  self._set_intXint(row, col, x.flat[0])


Mean both Score: 0.26