# Exercise 1
Implement the LSI using SVD. Use the the documents given below as
samples.

__Note__: Use Numpy's linalg package to understand the internals of SVD
1. information extraction systems
2. natural language processing
3. speech signal systems
4. speech processing

Use "speech systems" as the query to find out whether the results match
the answer for Question 5 (Assignment 2).

In [1]:
from gensim import corpora, similarities
from gensim import models

In [2]:
# Read (construct) the Corpus
corpus = [
    "information extraction systems",
    "natural language processing",
    "speech signal systems",
    "speech processing"
]

In [3]:
# Tokenize
corpus_text = [doc.split() for doc in corpus]
corpus_text

[['information', 'extraction', 'systems'],
 ['natural', 'language', 'processing'],
 ['speech', 'signal', 'systems'],
 ['speech', 'processing']]

In [4]:
# Reference: https://radimrehurek.com/gensim/tut2.html
# Create Coprus Dictionary
dictionary = corpora.Dictionary(corpus_text)

In [5]:
# Create Document-Term Matrix (Bag-of-Words)
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in corpus_text]

In [6]:
doc_term_matrix

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (5, 1)],
 [(2, 1), (6, 1), (7, 1)],
 [(5, 1), (7, 1)]]

In [7]:
# Create TF-IDF Matrix - by fit(ting) the model
tfidf = models.TfidfModel(doc_term_matrix)

In [8]:
# Transform (Apply) the model
corpus_tfidf = tfidf[doc_term_matrix]

In [9]:
# Create LSI Transformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)

In [10]:
lsi.print_topics()

[(0,
  '0.606*"speech" + 0.523*"processing" + 0.398*"signal" + 0.256*"systems" + 0.233*"natural" + 0.233*"language" + 0.114*"extraction" + 0.114*"information"'),
 (1,
  '-0.434*"extraction" + -0.434*"information" + -0.379*"systems" + 0.368*"natural" + 0.368*"language" + -0.323*"signal" + 0.321*"processing" + -0.025*"speech"'),
 (2,
  '0.472*"information" + 0.472*"extraction" + 0.400*"natural" + 0.400*"language" + -0.351*"signal" + -0.324*"speech" + 0.060*"systems" + 0.051*"processing"'),
 (3,
  '0.600*"signal" + -0.437*"processing" + 0.351*"language" + 0.351*"natural" + -0.313*"speech" + 0.214*"systems" + -0.172*"extraction" + -0.172*"information"')]

In [11]:
lsi.show_topics()

[(0,
  '0.606*"speech" + 0.523*"processing" + 0.398*"signal" + 0.256*"systems" + 0.233*"natural" + 0.233*"language" + 0.114*"extraction" + 0.114*"information"'),
 (1,
  '-0.434*"extraction" + -0.434*"information" + -0.379*"systems" + 0.368*"natural" + 0.368*"language" + -0.323*"signal" + 0.321*"processing" + -0.025*"speech"'),
 (2,
  '0.472*"information" + 0.472*"extraction" + 0.400*"natural" + 0.400*"language" + -0.351*"signal" + -0.324*"speech" + 0.060*"systems" + 0.051*"processing"'),
 (3,
  '0.600*"signal" + -0.437*"processing" + 0.351*"language" + 0.351*"natural" + -0.313*"speech" + 0.214*"systems" + -0.172*"extraction" + -0.172*"information"')]

In [12]:
# Create Query Document
query_doc = "speech systems"
query_vec_bow = dictionary.doc2bow(query_doc.split())

In [13]:
# convert the query to LSI space
query_vec_lsi = lsi[query_vec_bow]
query_vec_lsi

[(0, 0.8622500379254158),
 (1, -0.403157689387837),
 (2, -0.2639245139336679),
 (3, -0.09848073003635022)]

In [14]:
# transform corpus to LSI space and index it
# Reference: https://radimrehurek.com/gensim/tut3.html
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

In [15]:
# Obtain similarity against all the documents in the corpus
sims = index[query_vec_lsi]

In [16]:
print(list(enumerate(sims)))

[(0, 0.33579946), (1, 0.0), (2, 0.82253736), (3, 0.7123382)]


# Exercise 2

Try to improve the incomplete code (shown during the demo 2)(Topic
modeling using BBC corpus) from the github

<https://github.com/Ramaseshanr/anlp/blob/master/TopicModeling_LSI.ipynb>

Once completed, post your experience in the discussion forum for the benefit of others

In [17]:
import os
import pathlib
import gensim
from gensim.models import LsiModel
from gensim import models
from gensim import corpora
from gensim.utils import lemmatize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer # Yuva
from nltk.stem import WordNetLemmatizer # Yuva
from gensim.parsing.preprocessing import remove_stopwords, stem_text
from gensim.parsing.preprocessing import strip_numeric, strip_short,strip_multiple_whitespaces,strip_non_alphanum,strip_punctuation,strip_tags,preprocess_string
import pandas as pd
from gensim import similarities
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from pprint import pprint

[nltk_data] Downloading package wordnet to /home/ubuntujs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
#read the data
corpus_file_name = pathlib.Path("bbc-text.csv")

if corpus_file_name.exists():
    corpus_dir = corpus_file_name
else:
    corpus_dir = 'https://raw.githubusercontent.com/Ramaseshanr/anlp/master/corpus/bbc-text.csv'
    
df_corpus = pd.read_csv(corpus_dir,names=['category', 'text'])

if not corpus_file_name.exists():
    df_corpus.to_csv(corpus_file_name,index=False,header=False)

In [19]:
corpus = df_corpus['text'].values.tolist()
corpus = corpus[1:]

In [20]:
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
]

def preprocessing(corpus):
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    for document in corpus:
        doc = strip_numeric(document)
        doc = strip_multiple_whitespaces(doc)
        doc = remove_stopwords(doc)
        strip_tags(doc)
        doc = strip_short(doc,3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        tokens = gensim.utils.tokenize(doc, lower=True)
        tokens = [token for token in tokens if token not in stop_words ]
        #tokens = [stemmer.stem(token) for token in tokens]
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        yield tokens
        #yield gensim.utils.tokenize(doc, lower=True)

def preprocessing2(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = strip_multiple_whitespaces(doc)
        doc = remove_stopwords(doc)
        strip_tags(doc)
        doc = strip_short(doc,3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        yield gensim.utils.tokenize(doc, lower=True)

#texts = preprocessing(corpus)
#for text in texts:
#    print(list(text))

texts = preprocessing(corpus)
dictionary = corpora.Dictionary(texts)

In [21]:
dictionary.save_as_text("corpus_dict.txt")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [22]:
corpus_vocab = dictionary.itervalues()
print("Corpus Vocab Size: ", len(corpus_vocab), ", Corpus Dictionary Size: ", len(dictionary.token2id))

with open('corpus_vocab.txt', 'w') as f:
    v = sorted(corpus_vocab, key=len)
    for w in v:
        f.write(w + '\n')

Corpus Vocab Size:  24616 , Corpus Dictionary Size:  24616


In [23]:
dictionary.filter_extremes(no_below=1, keep_n=12000)
print('After filtering: ', len(dictionary.itervalues()))
dictionary.save_as_text("corpus_dict2.txt")

After filtering:  12000


In [24]:
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in preprocessing(corpus)]
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

In [25]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)  # initialize an LSI transformation

In [26]:
lsi.projection.k

200

In [27]:
lsi.projection.s

array([6.84438287, 4.7294355 , 4.37989994, 4.12825674, 4.03692568,
       3.47304033, 3.31738583, 3.1670767 , 3.13151107, 3.0595175 ,
       3.03531024, 2.99951209, 2.91873282, 2.86114222, 2.85054328,
       2.76950663, 2.68477085, 2.6449747 , 2.63223183, 2.61855186,
       2.58984396, 2.53109146, 2.51503901, 2.50642451, 2.49274855,
       2.46360968, 2.41743899, 2.38427969, 2.36736184, 2.35643889,
       2.35142666, 2.32585842, 2.31145166, 2.29339314, 2.27518011,
       2.26760211, 2.25906912, 2.24144481, 2.23393806, 2.21989762,
       2.20978342, 2.19496154, 2.14855006, 2.13594448, 2.13056783,
       2.12247875, 2.10679178, 2.09845711, 2.09480561, 2.08223954,
       2.07409489, 2.06831034, 2.05892253, 2.05592352, 2.03213908,
       2.02945454, 2.01468859, 2.00805544, 2.00178233, 1.99407337,
       1.98659703, 1.98004498, 1.97729875, 1.95860258, 1.95516093,
       1.94318695, 1.93510996, 1.93159055, 1.92364755, 1.91805095,
       1.90929183, 1.90020473, 1.89376974, 1.8870275 , 1.87990

In [28]:
pprint(lsi.print_topics(num_topics=5, num_words=25))

[(0,
  '-0.121*"labour" + -0.117*"election" + -0.102*"blair" + -0.101*"film" + '
  '-0.100*"game" + -0.099*"tax" + -0.098*"brown" + -0.093*"party" + '
  '-0.092*"government" + -0.088*"people" + -0.086*"tory" + -0.084*"bn" + '
  '-0.082*"mobile" + -0.075*"minister" + -0.075*"economy" + -0.072*"music" + '
  '-0.072*"service" + -0.068*"phone" + -0.067*"new" + -0.066*"best" + '
  '-0.065*"sale" + -0.064*"market" + -0.064*"howard" + -0.064*"growth" + '
  '-0.064*"rate"'),
 (1,
  '-0.265*"labour" + -0.252*"election" + -0.215*"blair" + -0.201*"brown" + '
  '-0.192*"tax" + -0.188*"tory" + -0.186*"party" + 0.176*"film" + 0.134*"game" '
  '+ -0.128*"chancellor" + 0.122*"award" + -0.121*"howard" + 0.117*"mobile" + '
  '-0.111*"minister" + -0.106*"prime" + -0.105*"government" + 0.099*"music" + '
  '0.097*"best" + 0.088*"phone" + -0.085*"lib" + -0.085*"conservative" + '
  '0.081*"player" + -0.079*"kennedy" + -0.071*"budget" + -0.070*"leader"'),
 (2,
  '-0.200*"mobile" + 0.187*"film" + -0.153*"phone