In [1]:
#https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [2]:
import pandas as pd
df_review=pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
documents=df_review.Description.astype(str)
documents.head()

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [3]:
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [4]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dastous\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dastous\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [6]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [7]:
import random
text_data = []
for line in documents:
    tokens = prepare_text_for_lda(line)
    if random.random() > .99:
        print(tokens)
        text_data.append(tokens)

['appleberry', 'indica', 'dominant', 'yield', 'machine', 'dynamite', 'bubblelicious', 'white', 'widow', 'strain', 'classic', 'central', 'asian', 'central', 'american', 'express', 'terpene', 'profile', 'strain', 'aroma', 'exemplify', 'title', 'smelling', 'subtle', 'apple', 'berry', 'effect', 'extremely', 'relax', 'border', 'sedative', 'imbue', 'weight', 'utilize', 'appleberry', 'anxiolytic', 'around', 'relief', 'appleberry', 'place', 'category', 'highlife', 'amsterdam']
['bettie', 'grow', 'liberty', 'reach', 'washington', 'perfectly', 'balance', 'hybrid', 'cannabis', 'strain', 'strain', 'provide', 'mellow', 'euphoria', 'focus', 'cerebral', 'space', 'encourage', 'energy', 'rather', 'lethargy']
['shark', 'barney', 'flavorful', 'cross', 'shark', 'cheese', 'strain', 'together', 'offer', 'consumer', 'content', 'reduce', 'inflammation', 'improve', 'alleviate', 'stress', 'shark', 'complex', 'floral', 'bouquet', 'contain', 'jasmine', 'berry', 'cheese', 'making', 'unique', 'challenge', 'flavor',

In [8]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [9]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.016*"plant" + 0.015*"sativa" + 0.009*"cannabis" + 0.009*"grower"')
(1, '0.056*"diesel" + 0.025*"strain" + 0.018*"strawberry" + 0.018*"hybrid"')
(2, '0.027*"strain" + 0.023*"indica" + 0.018*"effect" + 0.017*"dominant"')
(3, '0.041*"strain" + 0.018*"effect" + 0.018*"dream" + 0.015*"genetics"')
(4, '0.017*"effect" + 0.017*"strain" + 0.013*"cross" + 0.013*"candy"')


In [10]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(23, 1)]
[(0, 0.10006686), (1, 0.10004021), (2, 0.59982383), (3, 0.100029595), (4, 0.10003947)]


In [11]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.022*"strain" + 0.011*"indica" + 0.011*"chocolate" + 0.011*"querkle"')
(1, '0.029*"strain" + 0.020*"diesel" + 0.017*"effect" + 0.015*"dream"')
(2, '0.027*"strain" + 0.022*"effect" + 0.016*"hybrid" + 0.013*"purple"')


In [12]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.031*"critical" + 0.024*"elephantizer" + 0.024*"tranquil" + 0.016*"indica"')
(1, '0.002*"strain" + 0.002*"elephantizer" + 0.002*"tranquil" + 0.002*"plant"')
(2, '0.029*"strain" + 0.029*"strawberry" + 0.022*"headbanger" + 0.022*"diesel"')
(3, '0.050*"brain" + 0.026*"indica" + 0.026*"dominant" + 0.013*"dutch"')
(4, '0.027*"diesel" + 0.027*"effect" + 0.021*"strain" + 0.014*"sweet"')
(5, '0.033*"strain" + 0.020*"stress" + 0.020*"chocolate" + 0.014*"effect"')
(6, '0.036*"diesel" + 0.027*"strain" + 0.027*"candy" + 0.018*"sativa"')
(7, '0.037*"dream" + 0.037*"strain" + 0.031*"super" + 0.019*"heavy"')
(8, '0.053*"diesel" + 0.027*"hybrid" + 0.014*"strain" + 0.014*"indica"')
(9, '0.036*"strain" + 0.016*"purple" + 0.016*"effect" + 0.016*"candyland"')


In [14]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [15]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
