# Documents

In [1]:
# topics: health and sugar
doc1 = "sugar is bad to health."
doc5 = "health experts say that sugar is not good for your lifestyle."
doc11 = "my health is important, so I don't use sugar."
doc12 = "a good lifestyle means less blood pressure and a long life."
doc13 = "my life is important to me. so I practice sports."
doc19 = "My sister likes to have sugar, but not my father"

# driving 
doc2 = "my father is driving my sister around to dance practice by car."
doc7 = "my father does not need driving me there."
doc14 = "I love driving my car."
doc15 = "I am driving to relax. I really love my car"
doc16 = "driving my sister home is realy nice"

# school
doc3 = "my school is great, I love to study there"
doc4 = "sometimes I feel happy to perform well at school."
doc6 = "I am doing well at school, but my sister could study a little more."
doc17 = "I study everyday, and I love the school."
doc18 = "my school is the best."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc11, doc12, doc13, doc14, doc15, doc16, doc17, doc18, doc19]

# Cleaning and Preprocessing

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]

[nltk_data] Downloading package stopwords to /home/fmmb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fmmb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/fmmb/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
print(doc_clean)

[['sugar', 'bad', 'health'], ['father', 'driving', 'sister', 'around', 'dance', 'practice', 'car'], ['school', 'great', 'love', 'study'], ['sometimes', 'feel', 'happy', 'perform', 'well', 'school'], ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle'], ['well', 'school', 'sister', 'could', 'study', 'little', 'more'], ['father', 'need', 'driving', 'there'], ['health', 'important', 'use', 'sugar'], ['good', 'lifestyle', 'mean', 'le', 'blood', 'pressure', 'long', 'life'], ['life', 'important', 'me', 'practice', 'sport'], ['love', 'driving', 'car'], ['driving', 'relax', 'really', 'love', 'car'], ['driving', 'sister', 'home', 'realy', 'nice'], ['study', 'everyday', 'love', 'school'], ['school', 'best'], ['sister', 'like', 'sugar', 'father']]


# Document-Term Matrix

In [4]:
# Importing Gensim
import gensim
from gensim import corpora
from pprint import pprint  

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

print(doc_term_matrix)

[[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1)], [(12, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(1, 1), (2, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(9, 1), (12, 1), (13, 1), (18, 1), (23, 1), (24, 1), (25, 1)], [(6, 1), (7, 1), (26, 1), (27, 1)], [(1, 1), (2, 1), (28, 1), (29, 1)], [(20, 1), (21, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(8, 1), (28, 1), (32, 1), (36, 1), (37, 1)], [(4, 1), (6, 1), (11, 1)], [(4, 1), (6, 1), (11, 1), (38, 1), (39, 1)], [(6, 1), (9, 1), (40, 1), (41, 1), (42, 1)], [(11, 1), (12, 1), (13, 1), (43, 1)], [(12, 1), (44, 1)], [(2, 1), (7, 1), (9, 1), (45, 1)]]


# LDA Model

In [5]:
from pprint import pprint  

# Creating the object for LDA model using gensim library
lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=100)

# Results
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

[(0, '0.062*"father" + 0.061*"sister" + 0.061*"driving" + 0.061*"sugar"'),
 (1, '0.061*"love" + 0.060*"driving" + 0.060*"school" + 0.043*"health"'),
 (2, '0.046*"school" + 0.046*"good" + 0.046*"lifestyle" + 0.046*"life"')]


In [6]:
doc = clean("My father driving my sister around to dance practice.")
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel.get_document_topics(docrep))

[(0, 0.06026297), (1, 0.89166844), (2, 0.048068546)]


In [7]:
pprint(ldamodel[docrep])

[(0, 0.060196575), (1, 0.89173496), (2, 0.04806852)]


In [8]:
pprint(ldamodel[doc_term_matrix[1]])

[(0, 0.051683377), (1, 0.9062776), (2, 0.042039007)]


In [9]:
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel[docrep])

[(0, 0.7700631), (1, 0.118043676), (2, 0.111893244)]


In [10]:
ldamodel.get_term_topics(dictionary.token2id["sugar"], minimum_probability=0.001)

[(0, 0.049207557), (1, 0.0337398), (2, 0.001536117)]

# LSA Model

In [11]:
lsa = gensim.models.lsimodel.LsiModel

lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

pprint(lsamodel.print_topics(num_topics=3, num_words=4))
pprint(lsamodel.projection.u[dictionary.token2id["father"]])

[(0, '0.453*"driving" + 0.383*"sister" + 0.331*"school" + 0.320*"love"'),
 (1, '0.460*"school" + 0.308*"study" + -0.283*"sugar" + -0.244*"driving"'),
 (2, '0.324*"sugar" + 0.315*"good" + 0.315*"lifestyle" + 0.305*"health"')]
array([ 0.25774173, -0.20579016, -0.11838895])


In [12]:
print(lsamodel.projection.u.shape)
pprint(lsamodel.projection.s[2])

(46, 3)
3.1821612593996944


In [13]:
lsamodel.show_topic(0, topn=5)

[('driving', 0.4528334198444292),
 ('sister', 0.3831712509341244),
 ('school', 0.3306286089841619),
 ('love', 0.3198014365810732),
 ('car', 0.3109918030030588)]

In [14]:
#print(lsamodel[doc_term_matrix[1]])
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
print(lsamodel[docrep])

[(0, 0.16373040239929587), (1, -0.342174540917964), (2, 0.3435426652776111)]


# Possible Improvements


* Filtering
* Part of Speech Tag Filter
* Chunks (Parsing)
* NER


## IDF Filtering

In [15]:
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [16]:
tfidf = gensim.models.tfidfmodel.TfidfModel
tfidfmodel = tfidf(doc_term_matrix, id2word = dictionary)
print(tfidfmodel.id2word)
print(tfidfmodel.dfs)

Dictionary<46 unique tokens: ['bad', 'health', 'sugar', 'around', 'car']...>
{0: 1, 1: 3, 2: 4, 3: 1, 4: 3, 5: 1, 6: 5, 7: 3, 8: 2, 9: 4, 10: 1, 11: 4, 12: 5, 13: 3, 14: 1, 15: 1, 16: 1, 17: 1, 18: 2, 19: 1, 20: 2, 21: 2, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 2, 29: 1, 30: 1, 31: 1, 32: 2, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1}


In [17]:
voc = {}
for i in range(len(tfidfmodel.id2word)):
    if tfidfmodel.dfs[i] > 1:
        voc[tfidfmodel.id2word[i]] = tfidfmodel.idfs[i]
print(len(voc), voc)

15 {'health': 2.415037499278844, 'sugar': 2.0, 'car': 2.415037499278844, 'driving': 1.6780719051126378, 'father': 2.415037499278844, 'practice': 3.0, 'sister': 2.0, 'love': 2.0, 'school': 1.6780719051126378, 'study': 2.415037499278844, 'well': 3.0, 'good': 3.0, 'lifestyle': 3.0, 'important': 3.0, 'life': 3.0}


In [18]:
sel_features=sorted(voc, key=voc.__getitem__, reverse=False)
print(sel_features)

['driving', 'school', 'sugar', 'sister', 'love', 'health', 'car', 'father', 'study', 'practice', 'well', 'good', 'lifestyle', 'important', 'life']


In [19]:
new_doc_clean = [[w for w in doc if w in sel_features] for doc in doc_clean]
dictionary = corpora.Dictionary(new_doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in new_doc_clean]

# Now you can create new topic models using the selected vocabulary

In [20]:
print(new_doc_clean)

[['sugar', 'health'], ['father', 'driving', 'sister', 'practice', 'car'], ['school', 'love', 'study'], ['well', 'school'], ['health', 'sugar', 'good', 'lifestyle'], ['well', 'school', 'sister', 'study'], ['father', 'driving'], ['health', 'important', 'sugar'], ['good', 'lifestyle', 'life'], ['life', 'important', 'practice'], ['love', 'driving', 'car'], ['driving', 'love', 'car'], ['driving', 'sister'], ['study', 'love', 'school'], ['school'], ['sister', 'sugar', 'father']]


## Testing LDA and LSA again

### LDA Model

In [21]:
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=100)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

[(0, '0.247*"school" + 0.154*"study" + 0.108*"well" + 0.106*"good"'),
 (1, '0.267*"sugar" + 0.206*"health" + 0.145*"important" + 0.084*"life"'),
 (2, '0.229*"driving" + 0.155*"love" + 0.144*"sister" + 0.143*"car"')]


### LSA Model

In [22]:
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

pprint(lsamodel.print_topics(num_topics=3, num_words=4))

[(0, '0.502*"driving" + 0.385*"sister" + 0.382*"love" + 0.359*"car"'),
 (1, '0.529*"school" + 0.382*"study" + -0.370*"sugar" + -0.283*"health"'),
 (2, '-0.501*"sugar" + -0.449*"health" + 0.336*"driving" + -0.325*"school"')]


In [23]:
print(lsamodel.projection.u.shape)

(15, 3)


## Chunking

In [24]:
import nltk.chunk

nltk.download('conll2000')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def conll_tag_chunks(chunk_sents):
    tag_sents = [nltk.chunk.tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in tag_sents]

conll_train = nltk.corpus.conll2000.chunked_sents('train.txt')
conll_test = nltk.corpus.conll2000.chunked_sents('test.txt')
train_chunks = conll_tag_chunks(conll_train)
ubt_chunker = nltk.tag.TrigramTagger(train_chunks)

[nltk_data] Downloading package conll2000 to /home/fmmb/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fmmb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/fmmb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
sentence = 'John ate an apple'
pos_tags = nltk.pos_tag(sentence.split())
print(pos_tags)

[('John', 'NNP'), ('ate', 'VBP'), ('an', 'DT'), ('apple', 'NN')]


In [26]:
chunks = ubt_chunker.tag([t for w,t in pos_tags])
print(chunks)

[('NNP', 'B-NP'), ('VBP', 'B-VP'), ('DT', 'B-NP'), ('NN', 'I-NP')]


In [27]:
def get_chunks(docs):
    newdocs=[]
    for doc in docs:
        tags = nltk.pos_tag(nltk.word_tokenize(doc))
        chunks = ubt_chunker.tag([t for w,t in tags])
        phrase=[]
        for i in range(len(tags)):
            #print(tags[i], chunks[i])
            if chunks[i][1] == None or not chunks[i][1].startswith("I"):
                phrase.append(" "+tags[i][0])
            else:
                phrase.append("_"+tags[i][0])
        newdocs.append("".join(phrase).split())
    return newdocs

get_chunks(['John ate an apple', 'the computer is in the machine room'])

[['John', 'ate', 'an_apple'], ['the_computer', 'is', 'in', 'the_machine_room']]

In [28]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)

def clean2(docs):
    res = []
    for doc in docs:
        punc_free = ' '.join(ch for ch in doc if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        res.append([w for w in normalized.lower().split() if w not in stop])
    return res

chunks = get_chunks(['John ate an apple', 'the computer is in the machine room'])
clean2(chunks)

[['john', 'ate', 'an_apple'], ['the_computer', 'the_machine_room']]

In [29]:
doc_clean = clean2(get_chunks(doc_complete))
print(doc_clean)

[['sugar', 'bad', 'health'], ['my_father', 'is_driving', 'my_sister', 'around', 'dance_practice', 'car'], ['my_school', 'great', 'love_to_study'], ['sometimes', 'feel', 'happy', 'to_perform', 'well', 'school'], ['health_experts', 'say', 'sugar', 'is_not', 'good', 'your_lifestyle'], ['am_doing', 'well', 'school', 'my_sister', 'could_study', 'little'], ['my_father', 'does_not_need_driving'], ['my_health', 'important', "do_n't_use", 'sugar'], ['a_good_lifestyle', 'mean', 'le', 'blood', 'pressure', 'a_long_life'], ['my_life', 'important', 'i_practice_sports'], ['love_driving', 'my_car'], ['am_driving_to_relax', 'really', 'love', 'my_car'], ['driving', 'my_sister_home', 'realy', 'nice'], ['study', 'everyday', 'love', 'the_school'], ['my_school', 'the_best'], ['my_sister_likes', 'to_have', 'sugar', 'my_father']]


## Testing LDA and LSA again¶

In [30]:
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
print(doc_term_matrix)

[[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(2, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(8, 1), (14, 1), (17, 1), (23, 1), (24, 1), (25, 1)], [(7, 1), (26, 1)], [(2, 1), (27, 1), (28, 1), (29, 1)], [(30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(28, 1), (36, 1), (37, 1)], [(38, 1), (39, 1)], [(39, 1), (40, 1), (41, 1), (42, 1)], [(43, 1), (44, 1), (45, 1), (46, 1)], [(41, 1), (47, 1), (48, 1), (49, 1)], [(11, 1), (50, 1)], [(2, 1), (7, 1), (51, 1), (52, 1)]]


### LDA Model

In [31]:
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

[(0, '0.078*"my_car" + 0.045*"love" + 0.045*"well" + 0.045*"school"'),
 (1,
  '0.063*"my_father" + 0.044*"important" + 0.044*"sugar" + '
  '0.025*"dance_practice"'),
 (2, '0.066*"sugar" + 0.039*"my_school" + 0.037*"le" + 0.037*"blood"')]


### LSA Model

In [32]:
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)
pprint(lsamodel.print_topics(num_topics=3, num_words=4))

[(0,
  '-0.418*"well" + -0.418*"school" + -0.366*"my_sister" + -0.245*"my_father"'),
 (1, '0.621*"sugar" + 0.236*"my_father" + 0.226*"good" + 0.226*"say"'),
 (2,
  '-0.446*"my_father" + -0.306*"around" + -0.306*"car" + '
  '-0.306*"dance_practice"')]


# Intrinsic Evaluation

## Coherence Models

In [33]:
from gensim.models import CoherenceModel

In [34]:
coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5611155989902622


In [35]:
coherence_model_lsa = CoherenceModel(model=lsamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.49484777767595683


# Experimenting with an embeddings-based model
BERTopic: https://maartengr.github.io/BERTopic/index.html

In [36]:
#!pip install bertopic

In [44]:
from bertopic import BERTopic
topic_model = BERTopic(min_topic_size=3)
topics, probs = topic_model.fit_transform(doc_complete)

In [45]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,10,0_my_driving_sugar_is
1,1,6,1_school_study_my_well


In [46]:
topic_model.get_topic(0)

[('my', 0.19665652711039627),
 ('driving', 0.16091812985458892),
 ('sugar', 0.1389176655354814),
 ('is', 0.13481816460174118),
 ('health', 0.1141791006039511),
 ('car', 0.1141791006039511),
 ('father', 0.1141791006039511),
 ('not', 0.1141791006039511),
 ('to', 0.11369541316881668),
 ('sister', 0.10418824915161105)]

In [47]:
%matplotlib inline
topic_model.visualize_barchart()