# Documents

In [None]:
# The toy corpus: five short documents, collected in order.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

# Each document is also bound to its own name, doc1 through doc5.
doc1, doc2, doc3, doc4, doc5 = doc_complete

# Cleaning and Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    The text is lower-cased and split on whitespace. Each token is then:
    dropped if it is a stop word, stripped of punctuation characters, dropped
    if the stripped form is empty or a stop word, and finally lemmatized.

    The second stop-word check is needed because a stop word glued to
    punctuation (for example "it." or "me,") does not match the stop list
    until the punctuation has been removed.

    Args:
        doc: The raw document text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept = []
    for raw_token in doc.lower().split():
        # Checked before stripping so that stop-list entries containing
        # punctuation (the check is against the token exactly as written)
        # are still recognized.
        if raw_token in stop:
            continue
        token = ''.join(ch for ch in raw_token if ch not in exclude)
        # A token made only of punctuation becomes empty; a stop word that
        # carried punctuation becomes a plain stop word. Drop both.
        if not token or token in stop:
            continue
        kept.append(lemma.lemmatize(token))
    return " ".join(kept)


doc_clean = [clean(doc).split() for doc in doc_complete]

# Document-Term Matrix

In [None]:
# Importing Gensim
import gensim
from gensim import corpora
from pprint import pprint  

# Build the term dictionary of the corpus: every unique term is assigned an index.
dictionary = corpora.Dictionary(documents=doc_clean)

# Convert each tokenized document into its document-term representation
# using the dictionary built above.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

pprint(doc_term_matrix)

# LDA Model

In [None]:
from pprint import pprint  

# Alias for the gensim LDA model class.
lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix.
ldamodel = lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=5,
    passes=50,
)

# Results: three words for each of the five topics, then the topics of the
# second document.
pprint(ldamodel.print_topics(num_words=3, num_topics=5))
pprint(ldamodel.get_document_topics(doc_term_matrix[1]))

In [None]:
# Index the trained LDA model with the second document's vector and print the result.
second_doc_bow = doc_term_matrix[1]
pprint(ldamodel[second_doc_bow])

In [None]:
# Score an unseen sentence. It goes through the same `clean` preprocessing as
# the training documents so that its tokens are in the form the dictionary was
# built from (lower-cased, stop words removed, lemmatized).
query_tokens = clean("I like sugar very much").split()
pprint(ldamodel[dictionary.doc2bow(query_tokens)])

In [None]:
# Look up the dictionary id of "sugar" and query the model's topics for that term.
sugar_id = dictionary.token2id["sugar"]
ldamodel.get_term_topics(sugar_id, minimum_probability=0.001)

# LSA Model

In [None]:
# Alias for the gensim LSI model class, then fit it with three topics.
lsa = gensim.models.lsimodel.LsiModel
lsamodel = lsa(corpus=doc_term_matrix, id2word=dictionary, num_topics=3)

pprint(lsamodel.print_topics(num_words=3, num_topics=3))

# Inspect the projection: the row of `u` for the term "father", the shape of
# `u`, and the second entry of `s`.
father_id = dictionary.token2id["father"]
pprint(lsamodel.projection.u[father_id])
print(lsamodel.projection.u.shape)
pprint(lsamodel.projection.s[1])

In [None]:
# Notebook display: the top 5 terms of LSA topic 0.
lsamodel.show_topic(0, topn=5)

In [None]:
# Index the trained LSA model with the second document's vector and print the result.
second_doc_bow = doc_term_matrix[1]
print(lsamodel[second_doc_bow])

# Possible Improvements
* Filtering
* Part of Speech Tag Filter
* Chunks (Parsing)
* NER

## IDF filtering

In [None]:
# Alias for the gensim TF-IDF model class, then fit it on the document-term matrix.
tfidf = gensim.models.tfidfmodel.TfidfModel
tfidfmodel = tfidf(corpus=doc_term_matrix, id2word=dictionary)

# Print the model's id-to-word mapping and its `dfs` attribute.
print(tfidfmodel.id2word)
print(tfidfmodel.dfs)

In [None]:
# Map every term to its IDF value, printing each pair along the way.
voc = {}
for term_id in range(len(tfidfmodel.id2word)):
    term = tfidfmodel.id2word[term_id]
    idf_value = tfidfmodel.idfs[term_id]
    voc[term] = idf_value
    print(term, idf_value)

In [None]:
# Rank the terms by their value in `voc`, highest first, and keep at most 40.
ranked_terms = sorted(voc, key=lambda term: voc[term], reverse=True)
sel_features = ranked_terms[:40]
print(sel_features)

In [None]:
def select(doc, voc):
    """Return the tokens of ``doc`` that are present in ``voc``.

    ``doc`` is split on whitespace; tokens are compared exactly (no case
    folding or punctuation handling), and order and duplicates are preserved.

    Args:
        doc: The document text.
        voc: The allowed vocabulary; any container supporting ``in``.

    Returns:
        A list of the tokens of ``doc`` found in ``voc``.
    """
    # A list or tuple would be rescanned for every token; build the lookup
    # set once instead. Other containers are used as given.
    if isinstance(voc, (list, tuple)):
        voc = frozenset(voc)
    return [token for token in doc.split() if token in voc]

# Re-tokenize every document, keeping only the selected features. The features
# are terms of the dictionary built from the output of `clean`, so each
# document is run through `clean` first; splitting the raw text would leave
# tokens such as "Sugar" or "consume." that can never match them.
doc_clean = [select(clean(doc), sel_features) for doc in doc_complete]
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Now you can create new topic models using the selected vocabulary

## Chunking

In [None]:
import nltk.chunk
 
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into per-sentence lists of (tag, chunk) pairs.

    Each tree is converted with ``nltk.chunk.tree2conlltags``; the first
    element of every resulting triple (the word) is discarded.
    """
    tagged_sentences = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        tagged_sentences.append([(tag, chunk) for (_word, tag, chunk) in triples])
    return tagged_sentences

In [None]:
import nltk.corpus, nltk.tag
 
def ubt_conll_chunk_accuracy(train_sents, test_sents):
    """Train five n-gram chunk taggers and print the accuracy of each.

    Both sets of chunk trees are converted with ``conll_tag_chunks``. The
    taggers are trained on the training set and scored on the test set, in
    this order and with these labels:

    * ``u``   - unigram
    * ``ub``  - bigram backing off to ``u``
    * ``ubt`` - trigram backing off to ``ub``
    * ``ut``  - trigram backing off to ``u``
    * ``utb`` - bigram backing off to ``ut``

    Args:
        train_sents: Chunk trees used for training.
        test_sents: Chunk trees used for scoring.

    Returns:
        None; the scores are printed.
    """
    train_chunks = conll_tag_chunks(train_sents)
    test_chunks = conll_tag_chunks(test_sents)

    def report(label, chunker):
        # Newer NLTK releases deprecate `evaluate` in favor of `accuracy`;
        # use `accuracy` when the tagger has it and fall back otherwise.
        scorer = chunker.accuracy if hasattr(chunker, 'accuracy') else chunker.evaluate
        print(label, scorer(test_chunks))
        return chunker

    u_chunker = report('u:', nltk.tag.UnigramTagger(train_chunks))
    ub_chunker = report('ub:', nltk.tag.BigramTagger(train_chunks, backoff=u_chunker))
    report('ubt:', nltk.tag.TrigramTagger(train_chunks, backoff=ub_chunker))
    ut_chunker = report('ut:', nltk.tag.TrigramTagger(train_chunks, backoff=u_chunker))
    report('utb:', nltk.tag.BigramTagger(train_chunks, backoff=ut_chunker))
 
# conll chunking accuracy test: load the train and test chunk trees of the
# conll2000 corpus and print the accuracy of each tagger combination.
conll_train, conll_test = (
    nltk.corpus.conll2000.chunked_sents(fileid)
    for fileid in ('train.txt', 'test.txt')
)
ubt_conll_chunk_accuracy(conll_train, conll_test)

# treebank chunking accuracy test
# treebank_sents = nltk.corpus.treebank_chunk.chunked_sents()
# ubt_conll_chunk_accuracy(treebank_sents[:2000], treebank_sents[2000:])

In [None]:
conll_train = nltk.corpus.conll2000.chunked_sents('train.txt')
conll_test = nltk.corpus.conll2000.chunked_sents('test.txt')
train_chunks = conll_tag_chunks(conll_train)

# Build the unigram -> bigram -> trigram backoff chain that the name
# `ubt_chunker` refers to. A trigram tagger without a backoff returns None for
# any context it did not see during training, and because that None becomes
# part of the following contexts, the rest of the sentence tends to be left
# untagged as well.
ubt_chunker = nltk.tag.TrigramTagger(
    train_chunks,
    backoff=nltk.tag.BigramTagger(
        train_chunks,
        backoff=nltk.tag.UnigramTagger(train_chunks),
    ),
)

In [None]:
# A sample sentence, split on whitespace and POS-tagged.
sentence = 'John ate an apple'
sentence_tokens = sentence.split()
print(nltk.pos_tag(sentence_tokens))

In [None]:
# Keep only the POS tag of each (word, tag) pair, then chunk-tag that sequence.
tagged_sentence = nltk.pos_tag(sentence.split())
pos = [tag for _word, tag in tagged_sentence]
ubt_chunker.tag(pos)

In [None]:
def clean2(doc):
    """Remove punctuation characters from ``doc`` and lemmatize each token.

    Unlike ``clean``, the text is not lower-cased and stop words are kept.
    Returns the lemmas joined by single spaces.
    """
    without_punct = ''.join(filter(lambda ch: ch not in exclude, doc))
    lemmas = [lemma.lemmatize(token) for token in without_punct.split()]
    return " ".join(lemmas)

# POS-tag the tokens of every cleaned document.
tags = [nltk.pos_tag(clean2(document).split()) for document in doc_complete]
print("Doc[0]", tags[0])

In [None]:
# Chunk-tag every document from its sequence of POS tags alone.
chunks = []
for tagged_doc in tags:
    pos_sequence = [pos_tag for _word, pos_tag in tagged_doc]
    chunks.append(ubt_chunker.tag(pos_sequence))
print("chunks[0]", chunks[0])

In [None]:
# Pair each word of the first document with the chunk tag assigned at the same position.
first_doc_pairs = zip(tags[0], chunks[0])
print([(word, chunk_tag) for (word, _pos), (_pos_again, chunk_tag) in first_doc_pairs])

In [None]:
# Rebuild `doc_clean` with multi-word chunks merged into single tokens:
# a word whose chunk tag starts with "I" is glued to the previous word with
# "_", every other word (including untagged ones) starts a new token.
doc_clean = []
for tagged_doc, chunked_doc in zip(tags, chunks):
    phrase = []
    for (word, _pos), (_pos_again, chunk_tag) in zip(tagged_doc, chunked_doc):
        # `phrase` must be non-empty for there to be a previous word to attach
        # to; otherwise the token would begin with a stray "_".
        continues_chunk = (
            bool(phrase)
            and chunk_tag is not None
            and chunk_tag.startswith("I")
        )
        phrase.append(("_" if continues_chunk else " ") + word)
    doc_clean.append("".join(phrase).split())

In [None]:
# Notebook display of the current value of `doc_clean`.
doc_clean

## Evaluation for LDA

### Coherence Models

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Score the LDA model against the dictionary it was trained with and texts
# preprocessed the same way as its training data. The module-level
# `dictionary` and `doc_clean` have been rebound since training (IDF
# filtering, chunk merging), so passing them here would pair the model's word
# ids with a different dictionary and with tokens outside its vocabulary.
lda_texts = [clean(doc).split() for doc in doc_complete]
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=lda_texts,
    dictionary=ldamodel.id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
# Score the LSA model against the dictionary it was trained with and texts
# preprocessed the same way as its training data. The module-level
# `dictionary` and `doc_clean` have been rebound since training (IDF
# filtering, chunk merging), so passing them here would pair the model's word
# ids with a different dictionary and with tokens outside its vocabulary.
lsa_texts = [clean(doc).split() for doc in doc_complete]
coherence_model_lsa = CoherenceModel(
    model=lsamodel,
    texts=lsa_texts,
    dictionary=lsamodel.id2word,
    coherence='c_v',
)
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)