In [1]:
# References:
#https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [2]:
import pandas as pd  
import numpy as np
documents=pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
documents=documents.Description.astype(str)
documents.head()

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')  
doc_term_matrix = count_vect.fit_transform(documents.values.astype('U'))
doc_term_matrix

<2349x4265 sparse matrix of type '<class 'numpy.int64'>'
	with 86680 stored elements in Compressed Sparse Row format>

In [4]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=10, random_state=42)  
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [5]:
import random

for i in range(10):  
    random_id = random.randint(0,len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_id])

brazilian
internodal
motivates
lavender
scout
gorgeous
bloom
skunkiness
stain
ghost


In [6]:
first_topic = LDA.components_[0]
top_topic_words = first_topic.argsort()[-10:]
for i in top_topic_words:  
    print(count_vect.get_feature_names()[i])

indica
dominant
cross
thc
aroma
hybrid
effect
cbd
cheese
strain


In [7]:
for i,topic in enumerate(LDA.components_):  
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['indica', 'dominant', 'cross', 'thc', 'aroma', 'hybrid', 'effect', 'cbd', 'cheese', 'strain']


Top 10 words for topic #1:
['consumer', 'aroma', 'dominant', 'body', 'cross', 'og', 'indica', 'effect', 'kush', 'strain']


Top 10 words for topic #2:
['flavor', 'dominant', 'aroma', 'blue', 'sweet', 'indica', 'hybrid', 'effect', 'sativa', 'strain']


Top 10 words for topic #3:
['hybrid', 'seed', 'herer', 'cross', 'nan', 'effect', 'jack', 'sativa', 'haze', 'strain']


Top 10 words for topic #4:
['high', 'pineapple', 'durban', 'strain', 'poison', 'thc', 'scout', 'girl', 'cbd', 'cooky']


Top 10 words for topic #5:
['berry', 'aroma', 'hybrid', 'sativa', 'green', 'haze', 'indica', 'bud', 'strain', 'purple']


Top 10 words for topic #6:
['strawberry', 'dominant', 'indica', 'aroma', 'bud', 'hybrid', 'sweet', 'effect', 'sativa', 'strain']


Top 10 words for topic #7:
['effect', 'indica', 'sativa', 'bud', 'sour', 'diesel', 'high', 'seed', 'hybrid', 'strain']


Top 10 wor

In [8]:
topic_values = LDA.transform(doc_term_matrix)  
topic_values.shape

(2349, 10)

# NMF for Topic Modeling in Python

In [9]:
import pandas as pd  
import numpy as np
documents=pd.read_excel("../CannaConnect/Dataset/description_clean.xlsx")
documents=documents.Description.astype(str)
documents.head()

0     og   hybrid strain pack strong punch name sup...
1     aloha white widow especially potent cut white...
2     sativa dominant hybrid bred spain medical see...
3     dawgs hybrid g chemdawg genetics bred canadia...
4    known kosher tangie k gold  indica dominant hy...
Name: Description, dtype: object

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')  
doc_term_matrix = tfidf_vect.fit_transform(documents.values.astype('U'))

In [11]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10, random_state=42)  
nmf.fit(doc_term_matrix )

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [12]:
import random

for i in range(10):  
    random_id = random.randint(0,len(tfidf_vect.get_feature_names()))
    print(tfidf_vect.get_feature_names()[random_id])

turkish
vigor
standard
lift
hydroponic
real
went
nutrient
shiskaberry
season


In [13]:
first_topic = nmf.components_[0]  
top_topic_words = first_topic.argsort()[-10:]

In [14]:
for i in top_topic_words:  
    print(tfidf_vect.get_feature_names()[i])

produce
sweet
week
flower
plant
strain
bud
kush
indica
purple


In [15]:
for i,topic in enumerate(nmf.components_):  
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['produce', 'sweet', 'week', 'flower', 'plant', 'strain', 'bud', 'kush', 'indica', 'purple']


Top 10 words for topic #1:
['finding', 'final', 'filter', 'film', 'filling', 'filled', 'file', 'finally', 'zoning', 'nan']


Top 10 words for topic #2:
['potent', 'strain', 'body', 'pine', 'alien', 'sfv', 'indica', 'lemon', 'kush', 'og']


Top 10 words for topic #3:
['cross', 'uplifting', 'effect', 'chemdawg', 'nyc', 'hybrid', 'fuel', 'sativa', 'sour', 'diesel']


Top 10 words for topic #4:
['nd', 'strain', 'st', 'time', 'denver', 'competed', 'took', 'place', 'cannabis', 'cup']


Top 10 words for topic #5:
['light', 'lemon', 'seed', 'skunk', 'strain', 'jack', 'silver', 'super', 'sativa', 'haze']


Top 10 words for topic #6:
['effect', 'flavor', 'sweet', 'dominant', 'sativa', 'hybrid', 'berry', 'blueberry', 'dream', 'blue']


Top 10 words for topic #7:
['cross', 'terpene', 'scout', 'cherry', 'girl', 'effect', 'consumer', 'physical', 'strain', 'cooky']


Top 10 words 

In [16]:
topic_values = nmf.transform(doc_term_matrix)  
documents['Topic'] = topic_values.argmax(axis=1)  
documents['Topic']

array([2, 9, 5, ..., 4, 2, 7], dtype=int64)