## Import required Libraries.

In [None]:
import os
import numpy as np
import pandas as pd

import nltk
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

In [None]:
# Fetch the NLTK resources this notebook relies on. Each call returns True on
# success and False if the package could not be loaded.
nltk.download("punkt_tab")   # tokenizer models
nltk.download("wordnet")     # WordNet corpus, required by WordNetLemmatizer
nltk.download("stopwords")   # stop-word lists exposed via nltk.corpus.stopwords

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Error loading wornet: Package 'wornet' not found in index
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Ingest Data.

In [None]:
# Load the corpus from the CSV file in the working directory, then display
# the frame's dimensions as (rows, columns).
documents_df = pd.read_csv(filepath_or_buffer='documents.csv')
documents_df.shape

## Explore the Dataset.

In [None]:
# Print the text stored in the 'document' column for row label 0 to get a
# feel for what a single document looks like.
first_document = documents_df.loc[0, 'document']
print(first_document)

My husband is a born shopper. He loves to look at things and to touch them. He likes to compare prices between the same items in different shops. He would never think of buying anything without looking around in several different shops. On the other hand, I'm not a shopper. I think shopping is boring and unpleasant. If I like something and I have enough money to take it, I buy it at once. I never look around for a good price or a better deal. Of course my husband and I never go shopping together. Doing shopping together would be too painful for both of us. When it comes to shopping, we go our different ways.
Sometimes I ask my son Jimmy to buy some food in the shop not far from our home. But he is always absent-minded. This was his story.
One day I said to him, " I hope you won't forget what I have told you to buy." " No," said Jimmy. "I won't forget. You want three oranges , six eggs and a pound of meat."
He went running down the street to the shop. As he ran, he said to himself over 

## Vectorize the Dataset.

Here we convert the dataset into a bag-of-words representation using scikit-learn's `CountVectorizer`. Latent Dirichlet Allocation (LDA) models raw term counts, so this document-term count matrix is the input it is fitted on.

In [None]:
# Bag-of-words vectorizer: keep at most 2000 vocabulary terms and drop
# scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(max_features=2000, stop_words='english')

# Learn the vocabulary and build the document-term count matrix in one pass.
# Calling fit() followed by transform() on the same corpus produces the same
# matrix but tokenizes every document twice.
# NOTE: the name `X` is reassigned further down (TF-IDF matrix), so cells
# must be run in order.
X = vectorizer.fit_transform(documents_df['document'])

## Latent Dirichlet Allocation

In [None]:
# Configure a 10-topic LDA model with a fixed seed, then fit it on the
# document-term matrix X.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=10,
)
lda.fit(X)

In [None]:
# Topic to inspect. Defined once so the row lookup and the printed label
# below cannot drift apart.
topic_idx = 3

# .components_ has one row per topic and one column per vocabulary term.
single_topic = lda.components_[topic_idx]

# Each entry is an unnormalized topic-word weight (a pseudo-count), not a
# probability; larger values mean the word is more strongly tied to the topic.
# np.argsort sorts ascending, so the last 10 indices are the 10 largest
# weights, ordered from least to most important.
top10_word_idx = np.argsort(single_topic)[-10:]
top10_words = vectorizer.get_feature_names_out()[top10_word_idx]

# Show the top 10 words for the selected topic.
print(f"The top 10 words for topic #{topic_idx} are: \n{top10_words}")

## Non-negative Matrix Factorization (NMF)

NMF requires a non-negative input matrix. Here we use TF-IDF weights, which down-weight terms that appear in many documents.

In [None]:
# TF-IDF vectorizer: keep at most 2000 vocabulary terms and drop
# scikit-learn's built-in English stop words.
nnm_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')

# Learn the vocabulary and IDF weights and build the TF-IDF matrix in one
# pass. Calling fit() followed by transform() on the same corpus produces the
# same matrix but tokenizes every document twice.
# NOTE: this overwrites the bag-of-words matrix previously stored in `X`.
X = nnm_vectorizer.fit_transform(documents_df['document'])

# Create a 10-component NMF model with a fixed seed and fit it to the
# TF-IDF matrix.
nnm = NMF(n_components=10, random_state=42)
nnm.fit(X)

In [None]:
# Vocabulary terms, positioned to match the columns of nnm.components_.
feature_names = nnm_vectorizer.get_feature_names_out()

# For every NMF topic, print a header followed by its 10 highest-weighted
# words. argsort is ascending, so the strongest word is printed last.
for index, topic in enumerate(nnm.components_):
    print(f"The top 10 words for topic #{index} are:\n")
    top10_word_idx = topic.argsort()[-10:]
    top10_words = list(feature_names[top10_word_idx])
    print(top10_words)

The top 10 words for topic #0 are:

['think', 'good', 'like', 'things', 'time', 'life', 'friends', 'make', 'don', 'people']
The top 10 words for topic #1 are:

['came', 'boy', 'day', 'asked', 'old', 'went', 'father', 'mother', 'man', 'said']
The top 10 words for topic #2 are:

['education', 'high', 'college', 'student', 'teachers', 'teacher', 'class', 'schools', 'school', 'students']
The top 10 words for topic #3 are:

['000', 'country', 'beijing', 'new', 'year', 'people', 'world', 'city', 'chinese', 'china']
The top 10 words for topic #4 are:

['reading', 'read', 'learning', 'languages', 'learn', 'speak', 'chinese', 'words', 'language', 'english']
The top 10 words for topic #5 are:

['fish', 'animals', 'people', 'healthy', 'meat', 'foods', 'vegetables', 'eating', 'eat', 'food']
The top 10 words for topic #6 are:

['young', 'home', 'mother', 'families', 'school', 'family', 'kids', 'child', 'parents', 'children']
The top 10 words for topic #7 are:

['people', 'brain', 'research', 'healt