In [None]:
#pull big data from hive and create a dataframe
import pandas as pd  # needed for pd.read_sql_query / pd.DataFrame; no other cell imports it
import pyodbc as py

# The query is kept in its own variable so it can be inspected or edited
# independently of the connection handling below.
sql = """
Select chat from [database].[tablename]
"""

# Open the ODBC connection only for the duration of the read and always
# release it afterwards, even if the query fails.
connection = py.connect('DSN=xxx', autocommit=True)
try:
    df = pd.read_sql_query(sql, connection)
finally:
    connection.close()

# Rich preview of the first rows as the cell output.
df.head()

In [None]:
# `documents` is another name for the same DataFrame object as `df` (no copy is
# made), so the 'index' column added here is visible through both names.
documents = df
documents['index'] = documents.index  # copy the frame's index into a regular column

# Report the number of rows, then show the first five rows as plain text.
print(documents.shape[0])
print(documents.head(5))

In [None]:
#Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
#Words that have 3 or fewer characters are removed.
#All stopwords are removed.
#Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
#Words are stemmed — words are reduced to their root form.

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably meant to make the LDA training further down repeatable —
# confirm that the gensim models actually draw from this global state.
np.random.seed(2018)

In [None]:
import nltk
# Fetch the 'wordnet' resource through NLTK's downloader; WordNetLemmatizer is used
# in the cells that follow.
# NOTE(review): this call may reach out to the network each time the cell runs — confirm
# that is acceptable in the environment where this notebook is executed.
nltk.download('wordnet')

In [None]:
#Lemmatize example: lemmatize 'went' treating it as a verb (pos='v')
verb_lemmatizer = WordNetLemmatizer()
print(verb_lemmatizer.lemmatize('went', pos='v'))

In [None]:
#stemmer example: run a handful of words through the English Snowball stemmer
stemmer = SnowballStemmer('english')
original_words = [
    'able', 'right', 'days', 'any', 'much', 'hello',
    'agreed', 'owned', 'humbled', 'sized', 'meeting',
]
singles = list(map(stemmer.stem, original_words))

# Side-by-side table of each word and its stem, shown as the cell output.
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

In [None]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    A fresh ``WordNetLemmatizer`` produces the lemma (``pos='v'``) and the
    notebook-level ``stemmer`` object reduces it; the value returned is
    whatever ``stemmer.stem`` gives back.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized-and-stemmed tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stopwords or have 3 or fewer characters are dropped, and every
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The document to process. ``None`` and float NaN (how missing
            values typically show up in a pandas column) are accepted.

    Returns:
        list: The processed tokens in their original order; an empty list
        when ``text`` is missing.
    """
    # A missing value would make simple_preprocess raise and abort a whole
    # Series.map() over the corpus, so treat it as an empty document instead.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [None]:
#Select a document to preview after preprocessing.
# Pick the row whose 'index' column equals 9 and read its text from the 'chat'
# column by name, so the lookup does not depend on the order of the columns.
doc_sample = documents.loc[documents['index'] == 9, 'chat'].iloc[0]

print('original document: ')
# Naive split on single spaces, just to show the raw words before preprocessing.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# Run every chat message through preprocess(); the result is a Series holding
# one token list per document.
chat_texts = documents['chat']
processed_docs = chat_texts.map(preprocess)

# Peek at the first ten processed documents.
processed_docs.head(10)

In [None]:
#Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set
from itertools import islice

dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview the first 11 (token id, token) pairs. `items()` comes from the standard
# Mapping interface; the Python-2 style `iteritems()` is not available on newer
# gensim Dictionary objects and raises AttributeError there.
for k, v in islice(dictionary.items(), 11):
    print(k, v)

In [None]:
#Filter out tokens that appear in fewer than 3 documents (no_below, an absolute number) or in more than 0.5 of the documents (no_above, a fraction of total corpus size, not an absolute number).
#After the above two steps, keep at most keep_n of the most frequent tokens.
#NOTE(review): keep_n is so large here that it presumably acts as "no cap"; this note previously said 15 documents and 100000 tokens, which did not match the arguments — confirm which values are intended.
#The dictionary is modified in place.
dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=100000000000)

In [None]:
#Convert every processed document into its bag-of-words form (which dictionary words it contains and how many times each appears) and collect the results in ‘bow_corpus’.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

#Check the document we selected earlier.
bow_corpus[9]

In [None]:
#Preview Bag Of Words for our sample preprocessed document
bow_doc_9 = bow_corpus[9]

# Each entry is a (token id, count) pair; the dictionary maps the id back to its word.
for token_id, occurrences in bow_doc_9:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

In [None]:
from pprint import pprint
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first transformed document as a sanity check.
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

In [None]:
#Running LDA using Bag of Words: train our LDA model with gensim.models.LdaMulticore and save it to ‘lda_model’.
#The arguments request 10 topics and 2 passes over bow_corpus using 2 workers; id2word=dictionary supplies the id-to-word mapping.
#NOTE(review): no random_state is passed, so the learned topics may differ between runs — confirm whether reproducibility matters here.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [None]:
# Print every (topic id, topic description) pair reported by the bag-of-words model.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
#Running LDA using TF-IDF: same settings as the bag-of-words model except for the input corpus and the number of workers.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)

# Print every (topic id, topic description) pair reported by the TF-IDF model.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

In [None]:
#demo the model performance by classifying sample document using LDA TF-IDF
# NOTE(review): the TF-IDF-trained model is queried with the raw bag-of-words vector
# (bow_corpus[15]) rather than its TF-IDF-weighted form — confirm this is intended.
doc_15_topics = lda_model_tfidf[bow_corpus[15]]

# Highest-scoring topic first.
for index, score in sorted(doc_15_topics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
#Performance evaluation by classifying sample document using LDA Bag of Words model
#First show the preprocessed tokens stored under label 25 in processed_docs; the following cells score bow_corpus[25].
processed_docs[25]

In [None]:
# Topic distribution of document 25 under the bag-of-words LDA model, highest score first.
doc_25_bow_topics = lda_model[bow_corpus[25]]
for index, score in sorted(doc_25_bow_topics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

In [None]:
#Performance evaluation by classifying sample document using LDA TF-IDF model
# Same document as in the bag-of-words evaluation, scored by the TF-IDF-trained model; highest score first.
doc_25_tfidf_topics = lda_model_tfidf[bow_corpus[25]]
for index, score in sorted(doc_25_tfidf_topics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
#Testing model on unseen document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'

# Apply the same preprocessing as the training documents, then map the tokens
# onto the existing dictionary to get a bag-of-words vector.
unseen_tokens = preprocess(unseen_document)
bow_vector = dictionary.doc2bow(unseen_tokens)

# Score the vector with the bag-of-words LDA model, highest score first,
# describing each topic with 5 words.
for index, score in sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))