# Documents

In [None]:
# Toy corpus: seven short documents used by every cell below.
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, and father is always driving my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."
doc6 = "I am doing well at school, but my sister could study a little more."
doc7 = "Currently, I am studying at a nearby school. My father does not need driving me there."
# Intended topics: health and sugar (doc1, doc5), driving (doc2, doc3, doc7), school (doc4, doc6, doc7)

# Collect the documents into one list, in doc1..doc7 order
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]

# Cleaning and Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# English stop-word list as a set, tested once per token in the cleaning functions
stop = set(stopwords.words('english'))
# Characters of string.punctuation, removed from the text during cleaning
exclude = set(string.punctuation)
# Single lemmatizer instance reused by the cleaning functions
lemma = WordNetLemmatizer()
def clean(doc):
    """Normalise one raw document into a space-separated string of lemmas.

    Each whitespace-delimited token is lower-cased, stripped of the
    characters in ``exclude`` and lemmatized; stop words are dropped.

    The stop-word test runs twice: on the raw token, so entries of ``stop``
    that themselves contain punctuation still match, and again after the
    punctuation has been stripped, so a stop word directly followed by
    punctuation (e.g. ``"there."`` or ``"more."``) is removed as well
    instead of slipping through the filter.

    Args:
        doc: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (``""`` if none).
    """
    kept = []
    for token in doc.lower().split():
        if token in stop:
            continue
        word = "".join(ch for ch in token if ch not in exclude)
        # A token made only of punctuation collapses to "" and is skipped.
        if word and word not in stop:
            kept.append(lemma.lemmatize(word))
    return " ".join(kept)

# Clean every document and split it into a list of tokens
doc_clean = [clean(doc).split() for doc in doc_complete]

In [None]:
# Inspect the tokenised documents
print(doc_clean)

# Document-Term Matrix

In [None]:
# Importing Gensim
import gensim
from gensim import corpora
from pprint import pprint  

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

print(doc_term_matrix)

# LDA Model

In [None]:
from pprint import pprint  

# Alias for the LDA model class of the gensim library
lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix (3 topics, 50 passes).
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

# Results: print the 3 topics with 4 words each
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

In [None]:
# Query the LDA model with a sentence (same text as doc2)
doc = clean("My father driving my sister around to dance practice.")
# Bag-of-words vector of the cleaned sentence over the existing dictionary
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel.get_document_topics(docrep))

In [None]:
# Same bag-of-words vector, this time queried by indexing the model
pprint(ldamodel[docrep])

In [None]:
# Query the model with the stored vector at index 1 of the document-term matrix (doc2)
pprint(ldamodel[doc_term_matrix[1]])

In [None]:
# Query the LDA model with a sentence that is not one of the corpus documents
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
pprint(ldamodel[docrep])

In [None]:
# Topics for the term "sugar" (looked up by its dictionary id), with minimum_probability=0.001
ldamodel.get_term_topics(dictionary.token2id["sugar"], minimum_probability=0.001)

# LSA Model

In [None]:
# Alias for gensim's LsiModel class
lsa = gensim.models.lsimodel.LsiModel

# Fit a 3-topic model on the document-term matrix
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

pprint(lsamodel.print_topics(num_topics=3, num_words=4))
# Row of the projection matrix u that belongs to the term "father"
pprint(lsamodel.projection.u[dictionary.token2id["father"]])

In [None]:
# Shape of the projection matrix u
print(lsamodel.projection.u.shape)
# Entry at index 1 of projection.s
pprint(lsamodel.projection.s[1])

In [None]:
# Show topic 0 limited to 5 terms
lsamodel.show_topic(0, topn=5)

In [None]:
# Alternative query using a stored corpus vector (disabled):
#print(lsamodel[doc_term_matrix[1]])
# Query the LSA model with a sentence that is not one of the corpus documents
doc = clean("I like sugar very much")
docrep = dictionary.doc2bow(doc.split())
print(lsamodel[docrep])

# Possible Improvements
* Filtering
* Part of Speech Tag Filter
* Chunks (Parsing)
* NER

## IDF filtering

In [None]:
# Rebuild the dictionary and the document-term matrix from doc_clean
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [None]:
# Alias for gensim's TfidfModel class
tfidf = gensim.models.tfidfmodel.TfidfModel
# Fit the TF-IDF model on the document-term matrix
tfidfmodel = tfidf(doc_term_matrix, id2word = dictionary)
print(tfidfmodel.id2word)
# NOTE(review): dfs is presumably the per-term document frequency table — confirm in the gensim docs
print(tfidfmodel.dfs)

In [None]:
# Map every vocabulary term to its IDF weight, visiting term ids in ascending order
voc = {
    tfidfmodel.id2word[term_id]: tfidfmodel.idfs[term_id]
    for term_id in range(len(tfidfmodel.id2word))
}
print(len(voc), voc)

In [None]:
# Terms sorted by descending IDF weight; keep the first 40
sel_features=sorted(voc, key=voc.__getitem__, reverse=True)[:40]
print(sel_features)

In [None]:
# Restrict every document to the selected terms. The selection is turned into
# a set once so each membership test is O(1) instead of a scan of the list.
selected_terms = set(sel_features)
new_doc_clean = [[w for w in doc if w in selected_terms] for doc in doc_clean]
dictionary = corpora.Dictionary(new_doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in new_doc_clean]

# Now you can create new topic models using the selected vocabulary

In [None]:
# Inspect the documents after restricting them to the selected terms
print(new_doc_clean)

### Testing LDA and LSA again
#### LDA Model

In [None]:
# Retrain LDA (3 topics, 50 passes) on the current dictionary and document-term matrix
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

#### LSA Model

In [None]:
# Refit the 3-topic LSA model on the current dictionary and document-term matrix
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)

pprint(lsamodel.print_topics(num_topics=3, num_words=4))

In [None]:
# Shape of the projection matrix u of the refitted model
print(lsamodel.projection.u.shape)

## Chunking

In [None]:
import nltk.chunk
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into per-sentence lists of (POS tag, chunk tag) pairs.

    Every tree is flattened with ``nltk.chunk.tree2conlltags`` into
    (word, POS tag, chunk tag) triples; the word is then discarded.

    Args:
        chunk_sents: Iterable of chunk trees, one per sentence.

    Returns:
        A list with one list of (POS tag, chunk tag) tuples per sentence.
    """
    tagged_sentences = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        tagged_sentences.append([(pos, iob) for _word, pos, iob in triples])
    return tagged_sentences

# Chunked sentences of the conll2000 corpus, train and test files
conll_train = nltk.corpus.conll2000.chunked_sents('train.txt')
conll_test = nltk.corpus.conll2000.chunked_sents('test.txt')
train_chunks = conll_tag_chunks(conll_train)
# Trigram tagger that backs off to a bigram and then a unigram tagger.
# A trigram tagger on its own returns None for every context it has not seen
# in training (and for the tokens that follow it); the backoff chain lets the
# simpler taggers answer in those cases.
ubt_chunker = nltk.tag.TrigramTagger(
    train_chunks,
    backoff=nltk.tag.BigramTagger(
        train_chunks,
        backoff=nltk.tag.UnigramTagger(train_chunks),
    ),
)

In [None]:
# Part-of-speech tag an example sentence, tokenised on whitespace
sentence = 'John ate an apple'
pos_tags = nltk.pos_tag(sentence.split())
print(pos_tags)

In [None]:
# The chunker is given only the POS tags, not the words
chunks = ubt_chunker.tag([t for w,t in pos_tags])
print(chunks)

In [None]:
stop = set(stopwords.words('english'))
def clean2(doc):
    """Normalise one raw document: strip punctuation, lemmatize, drop stop words.

    Unlike ``clean``, punctuation is removed before anything else, so a stop
    word directly followed by punctuation is still recognised.

    The text is lower-cased *before* lemmatization. WordNet lemma lookup is
    case-sensitive, so lemmatizing first would leave capitalised plurals such
    as "Doctors" unreduced and index them as a separate term from "doctor".

    Args:
        doc: Raw document text.

    Returns:
        The remaining lower-case lemmas joined by single spaces.
    """
    punc_free = ''.join(ch for ch in doc.lower() if ch not in exclude)
    lemmas = (lemma.lemmatize(word) for word in punc_free.split())
    return " ".join(w for w in lemmas if w not in stop)

def get_chunks(docs):
    """Tokenise documents and glue chunk-internal words together with "_".

    Each document is cleaned with ``clean2``, POS-tagged, and the POS tags
    are passed to ``ubt_chunker``. A token whose chunk tag starts with "I"
    is appended to the previous entry with an underscore; every other token
    (including one the chunker left untagged, i.e. tag ``None``) starts a
    new entry.

    A token with an "I" tag that has no predecessor starts a new entry
    instead of producing an artefact term with a leading underscore.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        One list of (possibly underscore-joined) terms per document.
    """
    newdocs = []
    for doc in docs:
        tags = nltk.pos_tag(clean2(doc).split())
        chunks = ubt_chunker.tag([pos for _word, pos in tags])
        phrases = []
        for (word, _pos), (_tag, iob) in zip(tags, chunks):
            inside = iob is not None and iob.startswith("I")
            if inside and phrases:
                phrases[-1] += "_" + word
            else:
                phrases.append(word)
        newdocs.append(phrases)
    return newdocs

# Try get_chunks on two example sentences
get_chunks(['John ate an apple', 'my sister loves a singer a little more than I'])

In [None]:
# Replace doc_clean with the chunked version of the corpus
doc_clean = get_chunks(doc_complete)
print(doc_clean)

### Testing LDA and LSA again

In [None]:
# Rebuild the dictionary and the document-term matrix from the current doc_clean
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
print(doc_term_matrix)

#### LDA Model

In [None]:
# Retrain LDA (3 topics, 50 passes) on the current dictionary and document-term matrix
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
pprint(ldamodel.print_topics(num_topics=3, num_words=4))

#### LSA Model

In [None]:
# Refit the 3-topic LSA model on the current dictionary and document-term matrix
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(doc_term_matrix, num_topics=3, id2word = dictionary)
pprint(lsamodel.print_topics(num_topics=3, num_words=4))

## Intrinsic Evaluation 

### Coherence Models

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Coherence of the current LDA model, measured with the 'c_v' setting on doc_clean
coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
# Coherence of the current LSA model, measured with the 'c_v' setting on doc_clean
coherence_model_lsa = CoherenceModel(model=lsamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)