<a href="https://colab.research.google.com/github/fahmi54321/nlp_latentSemanticAnalysis-CountVectorizer/blob/main/Latent_Semantic_Analysis_Count_Vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the book-title corpus used throughout this notebook.
# -nc (no-clobber) makes wget skip the download when the file is already present.
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt

File ‘all_book_titles.txt’ already there; not retrieving.



In [2]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
# NLTK data needed below: tokenizer models, the English stop-word list,
# and WordNet for the lemmatizer.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Single shared lemmatizer instance; my_tokenizer uses it to put words into base form.
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
# read our book titles: one title per line, trailing whitespace/newline stripped.
# The context manager closes the file handle as soon as the list is built.
with open('all_book_titles.txt') as f:
  titles = [line.rstrip() for line in f]

In [6]:
# Start from NLTK's English stop-word list, held as a set because
# my_tokenizer tests token membership against it.
stops = set(stopwords.words('english'))

In [7]:
# note : https://docs.google.com/document/d/1PQ5WnpctYoY_eEPTew25KmeXvSPc8-hp/edit?usp=share_link&ouid=117635670089266886030&rtpof=true&sd=true

# Add corpus-specific noise words (publishing/edition jargon that appears in
# many book titles) on top of the generic English stop words.
stops = stops | {
  # edition / volume markers
  'edition', 'series', 'vol', 'volume',
  'second', 'third', 'fourth',
  # generic textbook vocabulary
  'introduction', 'application', 'approach', 'brief',
  'fundamental', 'guide', 'essential',
  # bundle / packaging terms
  'card', 'access', 'package', 'plus', 'etext', 'printed',
}

In [8]:
#note : https://docs.google.com/document/d/1UbBFQ06tVGhfz19O4gBreLzWWM-nkjqP/edit?usp=share_link&ouid=117635670089266886030&rtpof=true&sd=true

def my_tokenizer(s):
  """Turn a raw title string into a list of cleaned, lemmatized tokens.

  The string is lower-cased and split with NLTK's word tokenizer. A token
  is then dropped if it is 2 characters or shorter; otherwise it is
  lemmatized, and the lemma is dropped if it is in the module-level `stops`
  set or contains any digit (e.g. "3rd").

  Relies on the module-level `wordnet_lemmatizer` and `stops`.

  Args:
    s: the text to tokenize.

  Returns:
    The surviving lemmas, in their original order.
  """
  kept = []
  for raw in nltk.tokenize.word_tokenize(s.lower()):
    # very short tokens are probably not useful
    if len(raw) <= 2:
      continue

    # reduce the word to its base form before the remaining checks
    lemma = wordnet_lemmatizer.lemmatize(raw)

    if lemma in stops:
      continue

    # anything carrying a digit, e.g. "3rd" from "3rd edition"
    if any(ch.isdigit() for ch in lemma):
      continue

    kept.append(lemma)

  return kept

In [9]:
# Build the vectorizer around our custom tokenizer. binary=True records only
# whether a term occurs in a title, not how many times.
# token_pattern=None: the default regex pattern is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# UserWarning about the unused parameter.
# note : https://docs.google.com/document/d/1aYfEXrcGLeXWURp6nDH7FCOitNucm9fN/edit?usp=share_link&ouid=117635670089266886030&rtpof=true&sd=true
vectorizer = CountVectorizer(binary=True, tokenizer=my_tokenizer, token_pattern=None)

In [10]:
# Fit the vocabulary on the titles and encode them as a matrix.
# Because the vectorizer was created with binary=True, entries are 0/1
# presence flags rather than raw counts.
X = vectorizer.fit_transform(titles)



In [11]:
# create index -> word map, used later to label the points in the plot

# conceptually, this is what we want to build:
# index_word_map = [None] * len(vectorizer.vocabulary_)
# for word, index in vectorizer.vocabulary_.items():
#   index_word_map[index] = word

# but the fitted count vectorizer already stores it, so just ask for it
# note : https://docs.google.com/document/d/1_cXHQv6sxDDJqyk_k0UDKK0YYpz8NS_O/edit?usp=share_link&ouid=117635670089266886030&rtpof=true&sd=true
index_word_map = vectorizer.get_feature_names_out()

In [12]:
# flip X so that each row is a term and each column is a document
# NOTE(review): this reassigns X from itself, so running the cell a second
# time flips it back; run it exactly once after the fit_transform cell.
# note : https://docs.google.com/document/d/12M-9Fbv-NZWDx9yk-TORAlj71_y2ruSg/edit?usp=share_link&ouid=117635670089266886030&rtpof=true&sd=true
X = X.transpose()

In [13]:
# Perform SVD on our term-document matrix, which gives us back Z: one row per
# row of X, projected onto 2 latent components (the x/y coordinates we plot).
# TruncatedSVD's default solver is randomized, so random_state is pinned to
# make the embedding identical on every Restart & Run All.
svd = TruncatedSVD(n_components=2, random_state=0)
Z = svd.fit_transform(X)

In [14]:
# essentially create a scatterplot for Z. CoLab doesn't have interactive plots by default
!pip install plotly --upgrade



In [15]:
import plotly.express as px

In [16]:
# create our scatterplot: every vocabulary term is placed at its two SVD
# coordinates (columns 0 and 1 of Z) and labelled with the word itself

fig = px.scatter(x=Z[:, 0], y=Z[:, 1], text=index_word_map)
fig.update_traces(textposition='top center')
# give the figure a title and axis names so it can be read on its own
fig.update_layout(
  title='Latent Semantic Analysis of book-title terms',
  xaxis_title='SVD component 1',
  yaxis_title='SVD component 2',
)
fig.show()