In [None]:
## LATENT SEMANTIC ANALYSIS / INDEXING (LSA/LSI)

In [15]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [7]:
# Load the book titles: one title per line of the text file
#------------------------------------------------------------
file = "all_book_titles.txt"
# Use a context manager so the file handle is closed as soon as reading is
# done (the bare open() in a for-loop left it open until garbage collection).
# Each line has trailing whitespace/newline stripped and is lower-cased.
with open(file, encoding='utf-8') as fh:
    book_titles = [line.rstrip().lower() for line in fh]

In [13]:
# Single WordNetLemmatizer instance; my_tokenizer uses it to put tokens into base form.
# NOTE(review): presumably needs the NLTK 'wordnet' corpus available locally — confirm.
lemmatizer = WordNetLemmatizer()

In [14]:
# Stopword list used by my_tokenizer: NLTK's English stopwords (de-duplicated),
# followed by corpus-specific noise words picked by inspecting the titles.
stop_words = list(set(stopwords.words('english')))
stop_words.extend([
  'introduction', 'edition', 'series', 'application', 'approach',
  'card', 'access', 'package', 'plus', 'etext',
  'brief', 'vol', 'fundamental', 'guide', 'essential',
  'printed', 'third', 'second', 'fourth', 'volume',
])

In [20]:
def my_tokenizer(s):
  """Split a title into cleaned, lemmatized tokens.

  Steps, applied per token produced by ``nltk.tokenize.word_tokenize``:
    1. drop tokens of length <= 2 (probably not useful),
    2. drop stopwords in their surface form,
    3. lemmatize with the module-level ``lemmatizer``,
    4. drop stopwords again (catches plurals such as "editions" -> "edition"),
    5. drop tokens containing any digit, e.g. "3rd".

  Args:
    s: the text to tokenize (a single string).

  Returns:
    A list of token strings, in their original order.
  """
  # Set lookup is O(1); stop_words itself is a list, so `in` on it is linear.
  blocked = set(stop_words)

  tokens = []
  for raw in nltk.tokenize.word_tokenize(s):
    # remove short words, they're probably not useful
    if len(raw) <= 2:
      continue
    # Filter stopwords BEFORE lemmatizing: the lemmatizer strips a trailing
    # "s" from words like "was"/"has", and the mangled result ("wa"/"ha")
    # would no longer match the stopword list.
    if raw in blocked:
      continue
    # put the word into base form
    lemma = lemmatizer.lemmatize(raw)
    # Filter again so inflected forms of the custom stopwords are caught.
    if lemma in blocked:
      continue
    # remove any digits, i.e. "3rd edition"
    if any(c.isdigit() for c in lemma):
      continue
    tokens.append(lemma)

  return tokens

In [21]:
# Binary bag-of-words: each entry records whether a token occurs in a title,
# not how many times.
# token_pattern=None because the custom tokenizer replaces the regex-based
# one; leaving the default pattern in place makes scikit-learn warn that
# 'token_pattern' will not be used.
vectorizer = CountVectorizer(binary=True, tokenizer=my_tokenizer, token_pattern=None)
X = vectorizer.fit_transform(book_titles)

In [22]:
# Build the index -> word lookup used to label points in the plot later.

# Conceptually this inverts the vectorizer's word -> index vocabulary:
# index_word_map = [None] * len(vectorizer.vocabulary_)
# for word, index in vectorizer.vocabulary_.items():
#   index_word_map[index] = word

# The fitted CountVectorizer already exposes that mapping directly.
index_word_map = vectorizer.get_feature_names_out()

In [23]:
# Transpose X so that rows = terms and cols = documents.
# NOTE: this rebinds X in place, so the cell is not idempotent — running it a
# second time transposes X back. Re-run the vectorizer cell first if unsure.
X = X.T

In [24]:
# Project X onto its leading singular directions.
# n_components=2 is the library default, spelled out because the plot below
# reads exactly two columns of Z.
# random_state is fixed because the default solver is randomized; without a
# seed the coordinates can differ from run to run.
svd = TruncatedSVD(n_components=2, random_state=0)
Z = svd.fit_transform(X)

In [33]:
# Scatter the two SVD components, labelling each point with the matching
# entry of index_word_map.
# size_max was dropped: it only scales markers when a `size` argument is
# given, so it had no effect here.
fig = px.scatter(
    x=Z[:, 0],
    y=Z[:, 1],
    text=index_word_map,
    title="LSA of book titles: terms in the first two SVD components",
    labels={"x": "SVD component 1", "y": "SVD component 2"},
)
fig.update_traces(textposition='top center')
fig.show()