In [1]:
# References:
#https://stackabuse.com/python-for-nlp-topic-modeling/

In [2]:
import pandas as pd
documents=pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
documents=documents.Description.astype(str)
documents.head()

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [4]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [5]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)


Topic 0:
strain indica effect purple bud hybrid sweet aroma dominant kush
Topic 1:
nan zesty finish focused focus flying fluffy flowering flower floral
Topic 2:
og kush lemon indica sfv pine body strain alien potent
Topic 3:
diesel sour sativa fuel nyc coast chemdawg east hybrid cross
Topic 4:
cup cannabis place took denver st nd time strain category
Topic 5:
haze sativa silver super cerebral spicy light seed flowering strain
Topic 6:
cbd thc high content ratio inflammation pain patient strain psychoactive
Topic 7:
blue dream blueberry berry hybrid dj dominant sweet short sativa
Topic 8:
cooky girl scout cookie cherry gsc pie platinum cross doughy
Topic 9:
white widow hybrid resin snow seed shark trichome dominant rhino


In [6]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
strain effect thc cbd high sativa hybrid jack skunk sweet
Topic 1:
grape purps ape mendocino thunder urkle lavender friend headed clear
Topic 2:
cheese indica outdoor harvest strain gift indoors humboldt october flowering
Topic 3:
strain og kush effect indica hybrid aroma dominant body cross
Topic 4:
strain purple bud sativa sour diesel hybrid flower orange plant
Topic 5:
haze sativa strain seed effect hybrid time flowering cannabis bred
Topic 6:
strawberry seed cup hawaiian st widow award winning white place
Topic 7:
bubba nan pre nina kush blackberry limone colorado coffee described
Topic 8:
gold landrace region gorilla glue strain world sativa variety indigenous
Topic 9:
rare calyx la poison big purple durban cherry bud dankness
