## Topic Modelling
This notebook contains a demo of LDA and LSA using the gensim library. The dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.

In [None]:
# Import OS 
import os
# A virtual environment is highly recommended for NLTK, which requires a Python version higher than 3.5.
!pip install gensim
!pip install nltk



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
%%bash
# Fetch the book-summaries archive without progress output (-q), then unpack
# the gzipped tarball into the current working directory.
# NOTE(review): the loading cell expects the archive to contain
# booksummaries/booksummaries.txt — confirm if the upstream archive changes.
wget -q http://www.cs.cmu.edu/~dbamman/data/booksummaries.tar.gz
tar -xzf booksummaries.tar.gz

In [None]:
# Tokenize, lowercase, then drop stopwords and non-alphabetic tokens.
def preprocess(textstring, stops=None):
   """Turn a raw text string into a list of cleaned, lowercase tokens.

   Args:
      textstring: The raw text to tokenize.
      stops: Optional collection of lowercase stopwords to remove. When
         omitted, NLTK's English stopword list is used.

   Returns:
      A list of lowercase, purely alphabetic tokens that are not stopwords.
   """
   if stops is None:
      stops = set(stopwords.words('english'))
   tokens = word_tokenize(textstring)
   # The stopword test must be done on the lowercased token: NLTK's stopword
   # list is lowercase, so testing the original token lets capitalized
   # stopwords such as "He", "They" or "In" slip through and end up
   # dominating the topics.
   return [
      token.lower()
      for token in tokens
      if token.isalpha() and token.lower() not in stops
   ]

# Location of the extracted data set. The download cell above unpacks the
# archive in the current working directory, so a path relative to that
# directory works on any operating system (on Colab it resolves to
# /content/booksummaries/booksummaries.txt).
# Update this if you downloaded the data to a different location.
data_path = os.path.join('booksummaries', 'booksummaries.txt')

# Read the data set line by line. Each line is one tab-separated record whose
# seventh field (index 6) is handed to preprocess() as the summary text.
summaries = []
with open(data_path, encoding="utf-8") as data_file:  # closes the file when done
   for line in data_file:
      fields = line.split("\t")
      if len(fields) < 7:
         # Skip blank or malformed records instead of raising IndexError.
         continue
      summaries.append(preprocess(fields[6]))

# Build the token <-> id mapping from the tokenized summaries.
dictionary = Dictionary(summaries)

# Prune the vocabulary with the thresholds below (no_below=10, no_above=0.5),
# then encode every summary as a bag-of-words vector.
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = list(map(dictionary.doc2bow, summaries))

# Index the dictionary once so that it is "loaded" before `id2token` is read,
# then keep `id2token` as the id -> word lookup.
temp = dictionary[0]
id2word = dictionary.id2token

# Train the LDA topic model.
# LDA training is stochastic, so `random_state` is fixed to make the topics
# reproducible across "Restart & Run All".
model = LdaModel(corpus=corpus, id2word=id2word, iterations=400, num_topics=10,
                 random_state=42)

# Collect the topics together with their scores on the training corpus and
# pretty-print them.
top_topics = list(model.top_topics(corpus))
pprint(top_topics)

[([(0.008060368, 'he'),
   (0.0057050143, 'one'),
   (0.0051691257, 'school'),
   (0.004808169, 'father'),
   (0.004773357, 'mother'),
   (0.0041268375, 'home'),
   (0.003957477, 'family'),
   (0.003601159, 'new'),
   (0.003560975, 'house'),
   (0.003529598, 'day'),
   (0.0035204382, 'after'),
   (0.0034081945, 'she'),
   (0.0033485845, 'later'),
   (0.003305568, 'they'),
   (0.0032622234, 'back'),
   (0.003201671, 'tells'),
   (0.0031764603, 'when'),
   (0.0031710449, 'two'),
   (0.0031436537, 'time'),
   (0.0031340444, 'also')],
  -0.8280939923822519),
 ([(0.0070735016, 'he'),
   (0.005110412, 'one'),
   (0.0047350083, 'they'),
   (0.0042130165, 'back'),
   (0.003993648, 'find'),
   (0.0038619696, 'king'),
   (0.0032707788, 'two'),
   (0.003195354, 'also'),
   (0.0031643172, 'after'),
   (0.0030267006, 'city'),
   (0.0030206405, 'however'),
   (0.0029733991, 'help'),
   (0.0029359192, 'world'),
   (0.00289766, 'in'),
   (0.0028849225, 'way'),
   (0.002859651, 'time'),
   (0.002658007

In [None]:
# Print the 10 highest-weighted words of every LDA topic.
# Iterate over model.num_topics rather than a hard-coded 10 so the loop stays
# in sync with the number of topics the model was actually trained with.
for idx in range(model.num_topics):
    print("Topic #%s:" % idx, model.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.008*"ship" + 0.004*"one" + 0.004*"jack" + 0.004*"new" + 0.004*"crew" + 0.004*"he" + 0.004*"captain" + 0.004*"team" + 0.003*"two" + 0.003*"also"
Topic #1: 0.006*"one" + 0.005*"time" + 0.005*"earth" + 0.004*"human" + 0.004*"new" + 0.004*"in" + 0.004*"he" + 0.004*"world" + 0.004*"planet" + 0.003*"people"
Topic #2: 0.005*"tells" + 0.005*"he" + 0.005*"house" + 0.005*"she" + 0.005*"one" + 0.004*"find" + 0.004*"murder" + 0.004*"police" + 0.004*"david" + 0.004*"back"
Topic #3: 0.007*"novel" + 0.006*"book" + 0.006*"story" + 0.006*"life" + 0.005*"family" + 0.005*"one" + 0.004*"he" + 0.004*"also" + 0.004*"in" + 0.004*"first"
Topic #4: 0.007*"world" + 0.006*"earth" + 0.005*"time" + 0.005*"one" + 0.004*"new" + 0.004*"vlad" + 0.003*"city" + 0.003*"human" + 0.003*"story" + 0.003*"dragon"
Topic #5: 0.008*"book" + 0.006*"richard" + 0.006*"one" + 0.005*"heaven" + 0.004*"in" + 0.004*"also" + 0.004*"luke" + 0.004*"he" + 0.003*"it" + 0.003*"new"
Topic #6: 0.007*"he" + 0.005*"one" + 0.005*"they"

In [None]:
from gensim.models import LsiModel
# Fit an LSA (latent semantic indexing) model on the same bag-of-words corpus.
lsamodel = LsiModel(corpus=corpus, id2word=id2word, num_topics=10)

# Pretty-print all ten topics, each as a weighted sum of its ten top words.
pprint(lsamodel.print_topics(10, 10))


[(0,
  '0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + '
  '0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"'),
 (1,
  '0.494*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.178*"house" + 0.160*"she" + '
  '0.154*"father" + 0.148*"mr" + 0.146*"he" + 0.138*"tells" + -0.125*"one"'),
 (2,
  '0.557*"tom" + 0.251*"sophia" + -0.213*"she" + -0.191*"he" + 0.185*"mrs" + '
  '-0.163*"tells" + -0.144*"mother" + 0.136*"mr" + 0.129*"western" + '
  '0.102*"however"'),
 (3,
  '0.236*"they" + 0.203*"ship" + 0.184*"david" + -0.181*"he" + 0.180*"back" + '
  '0.166*"tells" + -0.164*"family" + -0.163*"life" + 0.155*"find" + '
  '-0.153*"narrator"'),
 (4,
  '-0.668*"he" + 0.255*"mother" + 0.212*"she" + 0.192*"father" + '
  '0.178*"family" + -0.124*"narrator" + -0.120*"monk" + 0.098*"school" + '
  '0.097*"novel" + 0.093*"children"'),
 (5,
  '0.484*"david" + -0.245*"king" + 0.170*"rosa" + 0.164*"book" + '
  '0.126*"harlan" + -0.115*"anita" + -0.113*"he" + 0.111*"she" + 

In [None]:
# Print every LSA topic on its own line as a weighted sum of its top 10 words,
# followed by a separator rule.
for idx in range(10):
    print(
        "Topic #%s:" % idx,
        lsamodel.print_topic(topicno=idx, topn=10),
    )

print("=" * 20)

Topic #0: 0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + 0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"
Topic #1: 0.494*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.178*"house" + 0.160*"she" + 0.154*"father" + 0.148*"mr" + 0.146*"he" + 0.138*"tells" + -0.125*"one"
Topic #2: 0.557*"tom" + 0.251*"sophia" + -0.213*"she" + -0.191*"he" + 0.185*"mrs" + -0.163*"tells" + -0.144*"mother" + 0.136*"mr" + 0.129*"western" + 0.102*"however"
Topic #3: 0.236*"they" + 0.203*"ship" + 0.184*"david" + -0.181*"he" + 0.180*"back" + 0.166*"tells" + -0.164*"family" + -0.163*"life" + 0.155*"find" + -0.153*"narrator"
Topic #4: -0.668*"he" + 0.255*"mother" + 0.212*"she" + 0.192*"father" + 0.178*"family" + -0.124*"narrator" + -0.120*"monk" + 0.098*"school" + 0.097*"novel" + 0.093*"children"
Topic #5: 0.484*"david" + -0.245*"king" + 0.170*"rosa" + 0.164*"book" + 0.126*"harlan" + -0.115*"anita" + -0.113*"he" + 0.111*"she" + 0.109*"gould" + 0.106*"would"
Topic #6: 0.697*"a