# Topic Modelling

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from pprint import pprint

Caso esteja a utilizar o Google Colab, deve seguir as instruções disponíveis no Moodle
e correr a célula seguinte.

In [None]:
# Colab-only setup: mount Google Drive, move into the course data folder and
# download the NLTK resources requested below.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    # IPython magic: change the working directory to the data folder on Drive.
    %cd /content/drive/MyDrive/Iscte/TM/data
    
    # The stop-word list and WordNet data are needed by the cleaning cell
    # (stopwords.words / WordNetLemmatizer).
    # NOTE(review): these downloads only run inside Colab; a local environment
    # must already have the resources installed.
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

### Defining the documents to process

In [None]:
# Toy corpus: short sentences hand-grouped into three themes. The document
# numbers are not contiguous (there is no doc8, doc9 or doc10).

# Theme 1: health, sugar and lifestyle
doc1 = "sugar is bad to health."
doc5 = "health experts say that sugar is not good for your lifestyle."
doc11 = "my health is important, so I don't use sugar."
doc12 = "a good lifestyle means less blood pressure and a long life."
doc13 = "my life is important to me. so I practice sports."
doc19 = "My sister likes to have sugar, but not my father"

# Theme 2: driving
doc2 = "my father is driving my sister around to dance practice by car."
doc7 = "my father does not need driving me there."
doc14 = "I love driving my car."
doc15 = "I am driving to relax. I really love my car"
# NOTE(review): "realy" is misspelled in doc16; left unchanged because the
# text is model input.
doc16 = "driving my sister home is realy nice"

# Theme 3: school
doc3 = "my school is great, I love to study there"
doc4 = "sometimes I feel happy to perform well at school."
doc6 = "I am doing well at school, but my sister could study a little more."
doc17 = "I study everyday, and I love the school."
doc18 = "my school is the best."

# Gather every document into a single list (ordered as written here, not
# grouped by theme).
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc11, doc12, doc13, doc14, doc15, doc16, doc17, doc18, doc19]

In [None]:
print(doc_complete)

# Cleaning and Preprocessing

In [None]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    """Return *doc* lower-cased, without stop words or punctuation, lemmatised.

    Relies on the module-level ``stop`` (stop-word set), ``exclude``
    (punctuation characters) and ``lemma`` (WordNet lemmatiser).

    Args:
        doc: A raw document string.

    Returns:
        A single string with the surviving lemmas separated by one space.
    """
    kept = []
    for token in doc.lower().split():
        # Check the raw token first so contractions such as "don't" still
        # match their stop-word entry before the apostrophe is stripped.
        if token in stop:
            continue
        word = "".join(ch for ch in token if ch not in exclude)
        # Check again after stripping punctuation: a stop word glued to a
        # punctuation mark ("me.", "there.") would otherwise slip through.
        if not word or word in stop:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)

doc_clean = [clean(doc).split() for doc in doc_complete]
print(doc_clean)

# Document-Term Matrix

In [None]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

pprint(doc_term_matrix)

# LDA Model

In [None]:
# Creating the object for the model using gensim library
lda = gensim.models.ldamodel.LdaModel

# Running and Training the model on the document term matrix.
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=100)

# Results
ldamodel.print_topics(num_topics=3, num_words=4)

In [None]:
doc = clean("My father driving my sister around to dance practice.")
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel.get_document_topics(docrep))

In [None]:
pprint(ldamodel[docrep])

In [None]:
pprint(ldamodel[doc_term_matrix[1]])

In [None]:
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel[docrep])

In [None]:
ldamodel.get_term_topics(dictionary.token2id["sugar"], minimum_probability=0.001)

# LSA Model

In [None]:
# Creating the object for the model using gensim library
lsa = gensim.models.lsimodel.LsiModel

# Running and Training the model on the document term matrix.
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

# Results
lsamodel.print_topics(num_topics=3, num_words=4)

In [None]:
pprint(lsamodel.projection.u[dictionary.token2id["father"]])

In [None]:
print(lsamodel.projection.u.shape)
pprint(lsamodel.projection.s[2])

In [None]:
lsamodel.show_topic(0, topn=5)

In [None]:
#print(lsamodel[doc_term_matrix[1]])
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
print(lsamodel[docrep])

# Possible Improvements


* Filtering
* Part of Speech Tag Filter
* Chunks (Parsing)
* NER


## IDF Filtering

In [None]:
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [None]:
tfidf = gensim.models.tfidfmodel.TfidfModel
tfidfmodel = tfidf(doc_term_matrix, id2word = dictionary)
print(tfidfmodel.id2word)
print(tfidfmodel.dfs)

In [None]:
# Map each term whose document frequency (`dfs`) is greater than 1 to its
# `idfs` value; ids are visited in ascending order, as before.
voc = {
    tfidfmodel.id2word[term_id]: tfidfmodel.idfs[term_id]
    for term_id in range(len(tfidfmodel.id2word))
    if tfidfmodel.dfs[term_id] > 1
}
print(len(voc), voc)

In [None]:
sel_features=sorted(voc, key=voc.__getitem__, reverse=False)
print(sel_features)

In [None]:
# Restrict every document to the selected vocabulary. Membership is tested
# against a set so each lookup is constant-time instead of a list scan.
selected_terms = set(sel_features)
new_doc_clean = [[w for w in doc if w in selected_terms] for doc in doc_clean]
dictionary = corpora.Dictionary(new_doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in new_doc_clean]

# Now you can create new topic models using the selected vocabulary

In [None]:
print(new_doc_clean)

## Testing LDA and LSA again

### LDA Model

In [None]:
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=100)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

### LSA Model

In [None]:
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

pprint(lsamodel.print_topics(num_topics=3, num_words=4))

In [None]:
print(lsamodel.projection.u.shape)

## Chunking

In [None]:
import nltk.chunk

nltk.download('conll2000')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into sequences of (POS tag, chunk tag) pairs.

    Each tree in ``chunk_sents`` is flattened with
    ``nltk.chunk.tree2conlltags`` into (word, POS tag, chunk tag) triples;
    the word is then dropped, leaving one list of pairs per sentence.
    """
    pos_chunk_sents = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        pos_chunk_sents.append([(pos, chunk) for (_word, pos, chunk) in triples])
    return pos_chunk_sents

conll_train = nltk.corpus.conll2000.chunked_sents('train.txt')
conll_test = nltk.corpus.conll2000.chunked_sents('test.txt')
train_chunks = conll_tag_chunks(conll_train)

# Unigram -> bigram -> trigram backoff chain (hence "ubt"). A TrigramTagger on
# its own returns None for any context it did not see during training; with the
# chain, such cases fall back to the bigram and then the unigram model.
unigram_chunker = nltk.tag.UnigramTagger(train_chunks)
bigram_chunker = nltk.tag.BigramTagger(train_chunks, backoff=unigram_chunker)
ubt_chunker = nltk.tag.TrigramTagger(train_chunks, backoff=bigram_chunker)

In [None]:
sentence = 'John ate an apple'
pos_tags = nltk.pos_tag(sentence.split())
print(pos_tags)

In [None]:
chunks = ubt_chunker.tag([t for w,t in pos_tags])
print(chunks)

In [None]:
def get_chunks(docs):
    """Tokenise each document and join chunk-internal words with "_".

    Every document is tokenised and POS-tagged with NLTK, then the POS tags
    are passed to the module-level ``ubt_chunker``. A word whose chunk tag
    starts with "I" is appended to the previous token with an underscore;
    any other word (including one the chunker could not tag, i.e. ``None``)
    starts a new token.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one list of (possibly underscore-joined) tokens per
        document.
    """
    newdocs = []
    for doc in docs:
        tags = nltk.pos_tag(nltk.word_tokenize(doc))
        chunks = ubt_chunker.tag([pos for _word, pos in tags])
        tokens = []
        for (word, _pos), (_tag, chunk_tag) in zip(tags, chunks):
            inside_chunk = chunk_tag is not None and chunk_tag.startswith("I")
            # An "inside" tag with nothing before it has nothing to attach
            # to, so it simply opens a new token (no dangling leading "_").
            if inside_chunk and tokens:
                tokens[-1] += "_" + word
            else:
                tokens.append(word)
        newdocs.append(tokens)
    return newdocs

get_chunks(['John ate an apple', 'the computer is in the machine room'])

In [None]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)

def clean2(docs):
    """Clean already-tokenised documents (e.g. the output of ``get_chunks``).

    Relies on the module-level ``stop`` (stop-word set), ``exclude``
    (punctuation characters) and ``lemma`` (WordNet lemmatiser).

    Args:
        docs: Iterable of token lists.

    Returns:
        A list of token lists, lower-cased and lemmatised, with punctuation
        tokens and stop words removed.
    """
    res = []
    for doc in docs:
        cleaned = []
        for token in doc:
            # Tokens that are a single punctuation mark are dropped outright.
            if token in exclude:
                continue
            # Lower-case before lemmatising, as `clean` does, so capitalised
            # words are lemmatised too.
            word = token.lower()
            # Filter stop words before lemmatising: the lemmatiser can alter
            # a stop word (e.g. strip a final "s") so that it no longer
            # matches the stop list.
            if word in stop:
                continue
            word = lemma.lemmatize(word)
            # Keep the original post-lemmatisation filter as well.
            if word not in stop:
                cleaned.append(word)
        res.append(cleaned)
    return res

chunks = get_chunks(['John ate an apple', 'the computer is in the machine room'])
clean2(chunks)

In [None]:
doc_clean = clean2(get_chunks(doc_complete))
print(doc_clean)

## Testing LDA and LSA again

In [None]:
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
print(doc_term_matrix)

### LDA Model

In [None]:
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

### LSA Model

In [None]:
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)
pprint(lsamodel.print_topics(num_topics=3, num_words=4))

# Intrinsic Evaluation

## Coherence Models

In [None]:
from gensim.models import CoherenceModel

In [None]:
coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
coherence_model_lsa = CoherenceModel(model=lsamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

# Experimenting with an embeddings-based model
BERTopic: https://maartengr.github.io/BERTopic/index.html

In [None]:
#!pip install bertopic

In [None]:
from bertopic import BERTopic
topic_model = BERTopic(min_topic_size=3)
topics, probs = topic_model.fit_transform(doc_complete)

In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
%matplotlib inline
topic_model.visualize_barchart()