In [11]:
# References:
# https://stackabuse.com/python-for-nlp-topic-modeling/

In [1]:
import pandas as pd
# Read the cleaned description spreadsheet; the raw frame keeps its own name so
# that `documents` always refers to the text Series used by the later cells.
descriptions_df = pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")

# Blank out missing cells before casting: astype(str) alone would turn NaN into
# the literal string "nan", which the vectorizers would then count as a word.
documents = descriptions_df["Description"].fillna("").astype(str)
documents.head()

0     og   hybrid pack strong punch name supposedly...
1     aloha white widow especially potent cut white...
2     sativa hybrid bred spain medical seed co bree...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    kosher tangie k gold  indica hybrid combine le...
Name: Description, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Upper bound on the vocabulary size kept by each vectorizer.
no_features = 1000


def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's vocabulary as a list of strings.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
    in favour of `get_feature_names_out()`; prefer the new method and fall back
    to the old one so the cell runs on either side of that change.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf weights.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _vocabulary_terms(tfidf_vectorizer)

# LDA is a probabilistic graphical model, so it is fitted on raw term counts
# rather than tf-idf weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _vocabulary_terms(tf_vectorizer)

In [9]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of topics to extract; shared by both models so their outputs are comparable.
no_topics = 15

# Run NMF on the tf-idf matrix.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of `alpha_W` / `alpha_H`. The replacement penalty is scaled
# differently, so the value needs re-tuning rather than a straight rename --
# confirm against the installed scikit-learn version.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA on the raw term-count matrix.
# n_components is passed explicitly: without it LatentDirichletAllocation falls
# back to its default of 10 topics and silently ignores `no_topics`.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

In [12]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object exposing `components_`, an iterable of per-topic weight
            arrays that support `.argsort()`.
        feature_names: sequence mapping a column index to its term.
        no_top_words: how many terms to print for each topic.

    Each topic is printed as a "Topic N:" header, one line of space-separated
    terms in descending weight order, and a blank separator.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_idx] for term_idx in ranked]
        print(" ".join(top_terms))
        print('\n')

# Number of terms to list per topic.
no_top_words = 10

# NMF topics first (tf-idf vocabulary), then LDA topics (term-count vocabulary).
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=no_top_words)
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
indica purple bud hybrid sweet aroma body seed plant sativa


Topic 1:
og lemon kush sfv pine indica body potent hybrid tahoe


Topic 2:
diesel sour sativa fuel nyc coast chemdawg east hybrid uplifting


Topic 3:
cup cannabis place took denver st nd time category won


Topic 4:
dream blue sativa hybrid crack green daytime colorado stress euphoria


Topic 5:
cbd thc high ratio content inflammation pain patient psychoactive harlequin


Topic 6:
blueberry berry blue dj short indica sweet flavor hybrid parent


Topic 7:
cooky girl scout cookie cherry gsc pie platinum doughy mint


Topic 8:
white widow hybrid resin snow shark trichome seed rhino euphoria


Topic 9:
kush bubba master indica pre hindu alien relaxing purple gupta


Topic 10:
haze sativa silver super spicy cerebral light mango amnesia seed


Topic 11:
orange hair bright citrus green bud dense pistil juice tangerine


Topic 12:
jack herer sativa ripper lemon creativity uplifting citrus combine patient


Topic 13:
region