In [11]:
# Reference:
# https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [1]:
import pandas as pd  
import numpy as np
# Read the cleaned strain descriptions and keep only the text column,
# coerced to str so every entry is a plain string.
documents = (
    pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
    ["Description"]
    .astype(str)
)
# Preview the first few descriptions.
documents.head()

0     og   hybrid pack strong punch name supposedly...
1     aloha white widow especially potent cut white...
2     sativa hybrid bred spain medical seed co bree...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    kosher tangie k gold  indica hybrid combine le...
Name: Description, dtype: object

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for LDA: drop English stop words, terms that occur in
# fewer than 5 documents, and terms that occur in more than 95% of documents.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.95,
)
# Cast to a unicode array before vectorizing.
doc_term_matrix = count_vect.fit_transform(documents.values.astype('U'))
# Display the sparse matrix summary (documents x vocabulary terms).
doc_term_matrix

<2277x2245 sparse matrix of type '<class 'numpy.int64'>'
	with 76028 stored elements in Compressed Sparse Row format>

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model on the count matrix; the fixed seed keeps the
# topics reproducible between runs. fit() returns the estimator itself.
LDA = LatentDirichletAllocation(random_state=42, n_components=10).fit(doc_term_matrix)
# Show the fitted estimator and its parameters.
LDA

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [4]:
import random

# Print ten randomly chosen vocabulary terms as a quick sanity check of the
# fitted CountVectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
feature_names = count_vect.get_feature_names()  # fetched once, not per iteration
for _ in range(10):
    # randrange excludes the upper bound, so the index is always valid;
    # random.randint(0, len(...)) can return len(...) and raise IndexError.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

hold
intended
pre
sandalwood
fade
bc
vigorous
fan
tonic
area


In [5]:
# Show the ten highest-weighted vocabulary terms of the first LDA topic.
first_topic = LDA.components_[0]
# argsort is ascending, so the last ten indices carry the largest weights and
# the strongest term is printed last.
top_topic_words = first_topic.argsort()[-10:]
# Build the vocabulary list once instead of once per printed word.
feature_names = count_vect.get_feature_names()
for word_idx in top_topic_words:
    print(feature_names[word_idx])

chemdawg
sativa
sweet
high
aroma
hybrid
thc
diesel
cbd
sour


In [6]:
# List the ten highest-weighted terms for every LDA topic (ascending weight,
# so the strongest term is last in each list).
# The vocabulary is looked up once for all topics rather than once per word.
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{topic_idx}:')
    # word_idx is kept distinct from topic_idx so the two indices cannot be
    # confused with each other.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['chemdawg', 'sativa', 'sweet', 'high', 'aroma', 'hybrid', 'thc', 'diesel', 'cbd', 'sour']


Top 10 words for topic #1:
['day', 'space', 'hybrid', 'citrus', 'aroma', 'lemon', 'genetics', 'seed', 'haze', 'sativa']


Top 10 words for topic #2:
['best', 'hybrid', 'time', 'og', 'high', 'kush', 'place', 'indica', 'cannabis', 'cup']


Top 10 words for topic #3:
['cooky', 'indica', 'body', 'aroma', 'og', 'hybrid', 'terpene', 'white', 'consumer', 'physical']


Top 10 words for topic #4:
['week', 'sativa', 'aroma', 'cheese', 'flowering', 'bud', 'seed', 'hybrid', 'plant', 'indica']


Top 10 words for topic #5:
['dream', 'indica', 'aroma', 'blueberry', 'sweet', 'seed', 'hybrid', 'blue', 'sativa', 'haze']


Top 10 words for topic #6:
['genetics', 'aroma', 'bubba', 'sweet', 'body', 'bud', 'grape', 'indica', 'kush', 'purple']


Top 10 words for topic #7:
['sativa', 'relief', 'hybrid', 'sweet', 'skunk', 'thc', 'indica', 'high', 'patient', 'pain']


Top 10 words for topic #8

In [7]:
# Project every document onto the fitted LDA topics.
topic_values = LDA.transform(X=doc_term_matrix)
# Display the shape of the resulting array (documents x topics).
topic_values.shape

(2277, 10)

# NMF for Topic Modeling in Python

In [9]:
import pandas as pd  
import numpy as np
# Reload the cleaned descriptions for the NMF section (same file and column
# as the load at the top of the notebook), keeping the text as str.
documents = (
    pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
    .loc[:, "Description"]
    .astype(str)
)
# Preview the first few descriptions.
documents.head()

0     og   hybrid pack strong punch name supposedly...
1     aloha white widow especially potent cut white...
2     sativa hybrid bred spain medical seed co bree...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    kosher tangie k gold  indica hybrid combine le...
Name: Description, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for NMF, with the same vocabulary pruning as the count
# vectorizer: English stop words removed, min_df=5, max_df=0.95.
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.95,
)
# NOTE: this rebinds doc_term_matrix, which earlier held the CountVectorizer
# counts used by the LDA cells; re-running those cells after this one would
# feed them the TF-IDF matrix instead.
doc_term_matrix = tfidf_vect.fit_transform(documents.values.astype('U'))

In [12]:
from sklearn.decomposition import NMF

# Fit a 10-component NMF on the TF-IDF matrix with a fixed seed.
# fit() returns the estimator itself.
nmf = NMF(random_state=42, n_components=10).fit(doc_term_matrix)
# Show the fitted estimator and its parameters.
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [13]:
import random

# Print ten randomly chosen vocabulary terms as a quick sanity check of the
# fitted TfidfVectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
feature_names = tfidf_vect.get_feature_names()  # fetched once, not per iteration
for _ in range(10):
    # randrange excludes the upper bound, so the index is always valid;
    # random.randint(0, len(...)) can return len(...) and raise IndexError.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

mental
conic
scout
set
lsd
nina
le
nearly
bloom
martian


In [14]:
# Term weights of the first NMF component (rebinds the name used for the LDA
# topic earlier in the notebook).
first_topic = nmf.components_[0]
# Indices of the ten largest weights, in ascending order of weight.
top_topic_words = np.argsort(first_topic)[-10:]

In [15]:
# Print the vocabulary terms selected in top_topic_words, in stored order.
# Build the vocabulary list once instead of once per printed word.
feature_names = tfidf_vect.get_feature_names()
for word_idx in top_topic_words:
    print(feature_names[word_idx])

green
dark
bud
granddaddy
hue
urkle
deep
indica
grape
purple


In [16]:
# List the ten highest-weighted terms for every NMF component (ascending
# weight, so the strongest term is last in each list).
# The vocabulary is looked up once for all topics rather than once per word.
feature_names = tfidf_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{topic_idx}:')
    # word_idx is kept distinct from topic_idx so the two indices cannot be
    # confused with each other.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['green', 'dark', 'bud', 'granddaddy', 'hue', 'urkle', 'deep', 'indica', 'grape', 'purple']


Top 10 words for topic #1:
['potent', 'kush', 'hybrid', 'cannabis', 'body', 'pine', 'sfv', 'indica', 'lemon', 'og']


Top 10 words for topic #2:
['east', 'coast', 'uplifting', 'nyc', 'chemdawg', 'hybrid', 'fuel', 'sativa', 'sour', 'diesel']


Top 10 words for topic #3:
['mango', 'spicy', 'uplifting', 'cerebral', 'lemon', 'jack', 'super', 'silver', 'sativa', 'haze']


Top 10 words for topic #4:
['short', 'flavor', 'dj', 'sativa', 'sweet', 'hybrid', 'berry', 'blueberry', 'dream', 'blue']


Top 10 words for topic #5:
['inflammation', 'took', 'ratio', 'content', 'place', 'cannabis', 'cup', 'high', 'thc', 'cbd']


Top 10 words for topic #6:
['high', 'flower', 'yield', 'week', 'seed', 'flowering', 'skunk', 'indica', 'bud', 'plant']


Top 10 words for topic #7:
['enjoy', 'body', 'sweet', 'terpene', 'cherry', 'scout', 'girl', 'physical', 'consumer', 'cooky']


Top 10 words f