In [36]:
# https://www.tutorialspoint.com/gensim/gensim_introduction.htm#:~:text=Gensim%20%3D%20%E2%80%9CGenerate%20Similar%E2%80%9D%20is,Performing%20topic%20identification

In [37]:
#!pip install pandas
#!pip install nltk
#conda install -c conda-forge textblob
#!pip install textblob

In [38]:
#Gensim = “Generate Similar” is a popular open source natural language processing (NLP) library used for unsupervised topic modeling. It uses top academic models and modern statistical machine learning to perform various complex tasks such as −

#Building document or word vectors
#Corpora
#Performing topic identification
#Performing document comparison (retrieving semantically similar documents)
#Analysing plain-text documents for semantic structure

# In order to speed up processing and retrieval on machine clusters, Gensim provides efficient multicore implementations of various popular algorithms like 
# Latent Semantic Analysis (LSA), 
# Latent Dirichlet Allocation (LDA),
# Random Projections (RP), 
# Hierarchical Dirichlet Process (HDP).


In [39]:
from gensim import models

In [40]:
#dir(models)

In [41]:
#!pip install gensim
#pip install --upgrade gensim

In [42]:
from gensim.models import Word2Vec

In [43]:
import pprint
t_corpus = [
   "A survey of user opinion of computer system response time", 
   "Relation of user perceived response time to error measurement", 
   "The generation of random binary unordered trees", 
   "The intersection graph of paths in trees", 
   "Graph minors IV Widths of trees and well quasi ordering",
]
stoplist = set('for a of the and to in'.split(' '))
processed_corpus = [[word for word in document.lower().split() if word not in stoplist]
   for document in t_corpus]
	
pprint.pprint(processed_corpus)


[['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering']]


In [44]:
import gensim
gensim.utils.simple_preprocess(t_corpus[0], deacc=False, min_len=2, max_len=15)

['survey',
 'of',
 'user',
 'opinion',
 'of',
 'computer',
 'system',
 'response',
 'time']

In [45]:
from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<25 unique tokens: ['computer', 'opinion', 'response', 'survey', 'system']...>


In [46]:
pprint.pprint(dictionary.token2id)

{'binary': 11,
 'computer': 0,
 'error': 7,
 'generation': 12,
 'graph': 16,
 'intersection': 17,
 'iv': 19,
 'measurement': 8,
 'minors': 20,
 'opinion': 1,
 'ordering': 21,
 'paths': 18,
 'perceived': 9,
 'quasi': 22,
 'random': 13,
 'relation': 10,
 'response': 2,
 'survey': 3,
 'system': 4,
 'time': 5,
 'trees': 14,
 'unordered': 15,
 'user': 6,
 'well': 23,
 'widths': 24}


In [47]:
BoW_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)],
 [(14, 1), (16, 1), (17, 1), (18, 1)],
 [(14, 1), (16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [48]:
from gensim import models
tfidf = models.TfidfModel(BoW_corpus)
words = "trees graph".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(14, 0.4869354917707381), (16, 0.8734379353188121)]


In [49]:
from gensim import similarities

In [50]:
index = similarities.SparseMatrixSimilarity(tfidf[BoW_corpus],num_features=5)

In [51]:
query_document = 'trees system'.split()

In [52]:
query_bow = dictionary.doc2bow(query_document)

In [53]:
query_bow

[(4, 1), (14, 1)]

In [54]:
#simils = index[tfidf[query_bow]]

In [55]:
#print(list(enumerate(simils)))

#for doc_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
#   print(doc_number, score)

In [56]:
#Model=models.LdaModel(corpus, id2word=dictionary, num_topics=100)

In [57]:
doc = [
   "CNTK formerly known as Computational Network Toolkit",
   "is a free easy-to-use open-source commercial-grade toolkit",
   "that enable us to train deep learning algorithms to learn like the human brain."
]

text_tokens = [[text for text in doc.split()] for doc in doc]

dict_LoS = corpora.Dictionary(text_tokens)

print(dict_LoS)

Dictionary<27 unique tokens: ['CNTK', 'Computational', 'Network', 'Toolkit', 'as']...>


In [58]:
print(dict_LoS.token2id)

{'CNTK': 0, 'Computational': 1, 'Network': 2, 'Toolkit': 3, 'as': 4, 'formerly': 5, 'known': 6, 'a': 7, 'commercial-grade': 8, 'easy-to-use': 9, 'free': 10, 'is': 11, 'open-source': 12, 'toolkit': 13, 'algorithms': 14, 'brain.': 15, 'deep': 16, 'enable': 17, 'human': 18, 'learn': 19, 'learning': 20, 'like': 21, 'that': 22, 'the': 23, 'to': 24, 'train': 25, 'us': 26}


In [59]:
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

dict_STF = corpora.Dictionary(
   simple_preprocess(line, deacc =True) for line in open('./data/doc.txt', encoding='utf-8')
)

print(dict_STF.token2id)

{'as': 0, 'cntk': 1, 'computational': 2, 'formerly': 3, 'known': 4, 'network': 5, 'toolkit': 6, 'commercial': 7, 'easy': 8, 'free': 9, 'grade': 10, 'is': 11, 'open': 12, 'source': 13, 'to': 14, 'use': 15, 'algorithms': 16, 'brain': 17, 'deep': 18, 'enable': 19, 'human': 20, 'learn': 21, 'learning': 22, 'like': 23, 'that': 24, 'the': 25, 'train': 26, 'us': 27}


In [60]:
class Read_files(object):
    def __init__(self, directoryname):
        self.directoryname = directoryname
    def __iter__(self):
        for fname in os.listdir(self.directoryname):
            for line in open(os.path.join(self.directoryname, fname), encoding='latin'):
                yield simple_preprocess(line)

In [61]:
path = "data"
dict_MUL = corpora.Dictionary(Read_files(path))
print(dict_MUL.token2id)

{'as': 0, 'cntk': 1, 'computational': 2, 'formerly': 3, 'known': 4, 'network': 5, 'toolkit': 6, 'commercial': 7, 'easy': 8, 'free': 9, 'grade': 10, 'is': 11, 'open': 12, 'source': 13, 'to': 14, 'use': 15, 'algorithms': 16, 'brain': 17, 'deep': 18, 'enable': 19, 'human': 20, 'learn': 21, 'learning': 22, 'like': 23, 'that': 24, 'the': 25, 'train': 26, 'us': 27}


# BoW

In [62]:
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

In [63]:
doc_list = [
"Hello, how are you?", "How do you do?",
"Hey what are you doing? yes you What are you doing?"
]

doc_tokenized = [simple_preprocess(doc) for doc in doc_list]

dictionary = corpora.Dictionary()

BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]

In [64]:
id_words = [[(dictionary[id], count) for id, count in line] for line in BoW_corpus]
print(id_words)

[[('are', 1), ('hello', 1), ('how', 1), ('you', 1)], [('how', 1), ('you', 1), ('do', 2)], [('are', 2), ('you', 3), ('doing', 2), ('hey', 1), ('what', 2), ('yes', 1)]]


In [65]:
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
doc_list = [
   "Hello, how are you?", "How do you do?", 
   "Hey what are you doing? yes you What are you doing?"
]
doc_tokenized = [simple_preprocess(doc) for doc in doc_list]
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
print(BoW_corpus)
id_words = [[(dictionary[id], count) for id, count in line] for line in BoW_corpus]
print(id_words)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(2, 1), (3, 1), (4, 2)], [(0, 2), (3, 3), (5, 2), (6, 1), (7, 2), (8, 1)]]
[[('are', 1), ('hello', 1), ('how', 1), ('you', 1)], [('how', 1), ('you', 1), ('do', 2)], [('are', 2), ('you', 3), ('doing', 2), ('hey', 1), ('what', 2), ('yes', 1)]]


In [66]:
doc_tokenized = [
   simple_preprocess(line, deacc =True) for line in open(path+'/doc.txt', encoding='utf-8')
]
dictionary = corpora.Dictionary()


BoW_corpus = [
   dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized
]
print(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(14, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]]


In [67]:
doc_tokenized = [
   simple_preprocess(line, deacc =True) for line in open('./data/doc.txt', encoding='utf-8')
]
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
print(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(14, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]]


In [68]:
from gensim import models

In [69]:

tfidf = models.TfidfModel(BoW_corpus)

doc_BoW = [(1,1),(3,1)]
print(tfidf[doc_BoW])

[(1, 0.7071067811865475), (3, 0.7071067811865475)]


In [70]:
corpus_tfidf = tfidf[BoW_corpus]
for doc in corpus_tfidf:
   print(doc)

[(0, 0.40369167389095173), (1, 0.40369167389095173), (2, 0.40369167389095173), (3, 0.40369167389095173), (4, 0.40369167389095173), (5, 0.40369167389095173), (6, 0.1489905855640844)]
[(6, 0.12831948188497175), (7, 0.34768308506769946), (8, 0.34768308506769946), (9, 0.34768308506769946), (10, 0.34768308506769946), (11, 0.34768308506769946), (12, 0.34768308506769946), (13, 0.34768308506769946), (14, 0.12831948188497175), (15, 0.34768308506769946)]
[(14, 0.20840410544601642), (16, 0.2823366384349904), (17, 0.2823366384349904), (18, 0.2823366384349904), (19, 0.2823366384349904), (20, 0.2823366384349904), (21, 0.2823366384349904), (22, 0.2823366384349904), (23, 0.2823366384349904), (24, 0.2823366384349904), (25, 0.2823366384349904), (26, 0.2823366384349904), (27, 0.2823366384349904)]


In [71]:
Model=models.TfidfModel(corpus_tfidf, normalize=True)

In [72]:
Model=models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

In [73]:
Model=models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=100)

In [74]:
Model=models.RpModel(corpus_tfidf, num_topics=500)

In [75]:
Model=models.HdpModel(corpus_tfidf, id2word=dictionary)

In [76]:
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
doc_list = [
   "Hello, how are you?", "How do you do?", 
   "Hey what are you doing? yes you What are you doing?"
]
doc_tokenized = [simple_preprocess(doc) for doc in doc_list]
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
for doc in BoW_corpus:
   print([[dictionary[id], freq] for id, freq in doc])
import numpy as np
tfidf = models.TfidfModel(BoW_corpus, smartirs='ntc')
for doc in tfidf[BoW_corpus]:
   print([[dictionary[id], np.around(freq,decimals=2)] for id, freq in doc])

[['are', 1], ['hello', 1], ['how', 1], ['you', 1]]
[['how', 1], ['you', 1], ['do', 2]]
[['are', 2], ['you', 3], ['doing', 2], ['hey', 1], ['what', 2], ['yes', 1]]
[['are', 0.4], ['hello', 0.81], ['how', 0.4], ['you', 0.17]]
[['how', 0.24], ['you', 0.1], ['do', 0.97]]
[['are', 0.3], ['you', 0.18], ['doing', 0.59], ['hey', 0.3], ['what', 0.59], ['yes', 0.3]]


# Topic Model

In [77]:
#Model=models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
#Model=models.HdpModel(corpus_tfidf, id2word=dictionary)
#Model=models.HdpModel(corpus_tfidf, id2word=dictionary)


In [79]:
import spacy

In [80]:
import nltk;
nltk.download('stopwords')
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [85]:
#newsgroups_train.data[:4]

In [90]:
#!pip install pyLDAvis

NameError: name 'pyLDAvis' is not defined

In [95]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.5.2-cp37-cp37m-win_amd64.whl (7.2 MB)
Collecting pillow>=6.2.0
  Downloading Pillow-9.1.1-cp37-cp37m-win_amd64.whl (3.3 MB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.33.3-py3-none-any.whl (930 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.2-cp37-cp37m-win_amd64.whl (54 kB)
Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib
Successfully installed cycler-0.11.0 fonttools-4.33.3 kiwisolver-1.4.2 matplotlib-3.5.2 pillow-9.1.1


In [99]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
#import pyLDAvis
#import pyLDAvis.gensim
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
data = newsgroups_train.data
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
#print(data[:4]) #it will print the data after prepared for stopwords
bigram = gensim.models.Phrases(data, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def remove_stopwords(texts):
   return [[word for word in simple_preprocess(str(doc)) 
   if word not in stop_words] for doc in texts]

def make_bigrams(texts):
   return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
   [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
   texts_out = []
   for sent in texts:
      doc = nlp(" ".join(sent))
      texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
   return texts_out



In [100]:
data_words_nostops = remove_stopwords(data)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=[
   'NOUN', 'ADJ', 'VERB', 'ADV'
])
print(data_lemmatized[:4]) #it will print the lemmatized data.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:4]) #it will print the corpus we created above.
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:4]] 
#it will print the words with their frequencies.
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=20, random_state=100, 
   update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

[['s', 'thing', 'car', 'nntp', 'post', 'host', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'report', 'keyword', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'line', 'nntp', 'post', 'host', 'fair', 'number', 'brave', 'soul', 'upgrade', 'si', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'day', 'floppy', 'disk', 'functionality', 'floppy', 'especially', 'r

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

print('\nPerplexity: ', lda_model.log_perplexity(corpus))

coherence_model_lda = CoherenceModel(
   model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:/mallet-2.0.8/'}) 
#You should update this path as per the path of Mallet directory on your system.
mallet_path = r'C:/mallet-2.0.8/bin/mallet' 
#You should update this path as per the path of Mallet directory on your system.

In [None]:
ldamallet = gensim.models.wrappers.LdaMallet(
   mallet_path, corpus=corpus, num_topics=20, id2word=id2word
)
pprint(ldamallet.show_topics(formatted=False))

In [None]:
ldamallet = gensim.models.wrappers.LdaMallet(
   mallet_path, corpus=corpus, num_topics=20, id2word=id2word
)
pprint(ldamallet.show_topics(formatted=False))

In [None]:
def coherence_values_computation(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word
        )
        model_list.append(model)
    coherencemodel = CoherenceModel(
        model=model, texts=texts, dictionary=dictionary, coherence='c_v'
    )
    coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
model_list, coherence_values = coherence_values_computation (
   dictionary=id2word, corpus=corpus, texts=data_lemmatized, 
   start=1, limit=50, step=8
)
limit=50; start=1; step=8;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
for m, cv in zip(x, coherence_values):
   print("Num Topics =", m, " is having Coherence Value of", round(cv, 4))

In [None]:
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

# Finding dominant topics in sentences

In [None]:
def dominant_topics(ldamodel=lda_model, corpus=corpus, texts=data):
sent_topics_df = pd.DataFrame()

In [None]:
for i, row in enumerate(ldamodel[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)

In [None]:
for j, (topic_num, prop_topic) in enumerate(row):
    if j == 0: # => dominant topic
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        sent_topics_df = sent_topics_df.append(
        pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
    else:
        break
    
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

In [None]:
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
   return(sent_topics_df)
df_topic_sents_keywords = dominant_topics(
   ldamodel=optimal_model, corpus=corpus, texts=data
)

In [None]:
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'
]

In [None]:
def coherence_values_computation(dictionary, corpus, texts, limit, start=2, step=3):
   coherence_values = []
   model_list = []
   for num_topics in range(start, limit, step):
      model = gensim.models.wrappers.LdaMallet(
         mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word
      )
      model_list.append(model)
   coherencemodel = CoherenceModel(
      model=model, texts=texts, dictionary=dictionary, coherence='c_v'
   )
   coherence_values.append(coherencemodel.get_coherence())
return model_list, coherence_values

In [None]:
model_list, coherence_values = coherence_values_computation (
   dictionary=id2word, corpus=corpus, texts=data_lemmatized, 
   start=1, limit=50, step=8
)
limit=50; start=1; step=8;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
for m, cv in zip(x, coherence_values):
   print("Num Topics =", m, " is having Coherence Value of", round(cv, 4))

In [None]:
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def dominant_topics(ldamodel=lda_model, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()

In [None]:
for i, row in enumerate(ldamodel[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)

In [None]:
for j, (topic_num, prop_topic) in enumerate(row):
   if j == 0: # => dominant topic
      wp = ldamodel.show_topic(topic_num)
      topic_keywords = ", ".join([word for word, prop in wp])
sent_topics_df = sent_topics_df.append(
   pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True
)
   else:
      break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

In [None]:
contents = pd.Series(texts)
   sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
   return(sent_topics_df)
df_topic_sents_keywords = dominant_topics(
   ldamodel=optimal_model, corpus=corpus, texts=data
)

In [None]:
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'
]

# Finding Most Representative Document

In [None]:
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
   sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], axis=0)
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
sent_topics_sorteddf_mallet.columns = [
   'Topic_Number', "Contribution_Perc", "Keywords", "Text"
]
sent_topics_sorteddf_mallet.head()

# Volume & Distribution of Topics

In [None]:
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

In [None]:
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

In [None]:
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

In [None]:
df_dominant_topics = pd.concat(
[topic_num_keywords, topic_counts, topic_contribution], axis=1
)

In [None]:
df_dominant_topics.columns = [
   'Dominant-Topic', 'Topic-Keywords', 'Num_Documents', 'Perc_Documents'
]
df_dominant_topics

# LSI & HDP Topic Modeling

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
data = newsgroups_train.data
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

#print(data_words[:4]) #it will print the data after prepared for stopwords
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def remove_stopwords(texts):
   return [[word for word in simple_preprocess(str(doc)) 
   if word not in stop_words] for doc in texts]

def make_bigrams(texts):
   return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
   return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
   texts_out = []
   for sent in texts:
      doc = nlp(" ".join(sent))
      texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
   return texts_out



In [None]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
data_lemmatized = lemmatization(
   data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
)
print(data_lemmatized[:4]) #it will print the lemmatized data.

id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:4]) #it will print the corpus we created above.
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:4]] 


In [None]:
#it will print the words with their frequencies.
lsi_model = gensim.models.lsimodel.LsiModel(
   corpus=corpus, id2word=id2word, num_topics=20,chunksize=100
)

pprint(lsi_model.print_topics())
doc_lsi = lsi_model[corpus]



In [None]:
Hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word)
pprint(Hdp_model.print_topics())

# Developing Word2Vec Embedding

In [108]:
from gensim.models import Word2Vec
sentences = [
   ['this', 'is', 'gensim', 'tutorial', 'for', 'free'],
   ['this', 'is', 'the', 'tutorials' 'point', 'website'],
   ['you', 'can', 'read', 'technical','tutorials', 'for','free'],
   ['we', 'are', 'implementing','word2vec'],
   ['learn', 'full', 'gensim', 'tutorial']
]
model = Word2Vec(sentences, min_count=1)

In [125]:

print(model)
words = list(model.wv.key_to_index.keys())
print(words)

#print(model['tutorial'])
model.save('model.bin')
new_model = Word2Vec.load('model.bin')
print(new_model)



Word2Vec<vocab=20, vector_size=100, alpha=0.025>
['this', 'is', 'gensim', 'tutorial', 'for', 'free', 'learn', 'the', 'tutorialspoint', 'website', 'full', 'can', 'read', 'technical', 'tutorials', 'we', 'are', 'implementing', 'word2vec', 'you']
Word2Vec<vocab=20, vector_size=100, alpha=0.025>


In [154]:
#model = Word2Vec(sentences, min_count=1)
sent_vec=[]
import numpy as np
for sent in sentences:
    sents=[]
    for word in sent:
        word_vec = model.wv.get_vector(word)
        sents.append(word_vec)
    sent_vec.append(np.array(sents).mean(axis=1))


In [158]:
!pip uninstall PCA

In [None]:
from sklearn.decomposition import PCA

In [156]:
model = Word2Vec(sentences, min_count=1)
import PCA
X = sent_vec #model[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))

pyplot.show()

NameError: name 'PCA' is not defined

# Creating Document Vectors Using Doc2Vec

In [None]:
import gensim
import gensim.downloader as api
dataset = api.load("text8")
data = [d for d in dataset]
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [None]:
data_for_training = list(tagged_document(data))
#print(data_for_training[:1])
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)

In [103]:
model.build_vocab(data_for_training)
model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)
print(model.infer_vector(['violent', 'means', 'to', 'destroy', 'the','organization'])) 


[-0.04645032 -0.06525037 -0.44830814  0.0735919  -0.01256045 -0.19441895
  0.00240697  0.04283302 -0.08887281  0.06730043  0.08412008 -0.02412098
 -0.29227853 -0.25381443 -0.25344747  0.09784282 -0.03148327  0.01436284
 -0.27366793 -0.08353205 -0.09285121  0.06544829 -0.10899279  0.1396095
 -0.09804571 -0.09945874 -0.15530185 -0.06962255  0.09758286 -0.02589562
  0.03024218  0.05621434 -0.36436296 -0.5566834  -0.22413267 -0.21288651
 -0.12768608 -0.2614937  -0.0357865  -0.11282396]


In [None]:
Bigger_list=[]
for i in df['patterns']:
    li = list(i.split(" "))
    Bigger_list.append(li)
Model= Word2Vec(Bigger_list,min_count=1,size=300,workers=4)


Model.save("word2vec.model")
Model.save("model.bin")

model = Word2Vec.load('model.bin')

vocab = list(model.wv.vocab)
vocab


similar_words = model.most_similar('thanks')
print(similar_words)


dissimlar_words = model.doesnt_match('See you later, thanks for visiting'.split())
print(dissimlar_words)


similarity_two_words = model.similarity('please','see')
print("Please provide the similarity between these two words:")
print(similarity_two_words)


similar = model.similar_by_word('kind')
print(similar)
