In [160]:
import pandas as pd
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Load the small English pipeline by its package name. The bare "en" shortcut
# only resolves where a spaCy 2 shortcut link was created, whereas the package
# name works on both spaCy 2 and spaCy 3 installs.
nlp = spacy.load("en_core_web_sm")

# Do not truncate long text columns (e.g. full review bodies) when displaying frames.
pd.set_option('display.max_colwidth', None)

In [161]:
def preprocess(text, remove_stop_words=False):
    '''
    Normalise a raw review into a single space-separated string of lemmas.

    Steps:
        - tokenise and lowercase with gensim's ``simple_preprocess``
        - lemmatise the tokens with the spaCy pipeline ``nlp``
        - drop tokens whose lemma is the "-PRON-" placeholder
        - optionally drop spaCy stop words (``remove_stop_words=True``)

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, or the float NaN
        pandas uses for a missing cell) is treated as an empty review.
    remove_stop_words : bool, default False
        When True, tokens flagged by spaCy as stop words are removed.
        Off by default so the output keeps the determiners and other
        function words it has always contained.

    Returns
    -------
    str
        The lemmas joined by single spaces; "" when there is nothing to keep.
    '''
    # Missing reviews arrive as NaN/None; simple_preprocess cannot handle those.
    if not isinstance(text, str):
        return ""

    tokens = simple_preprocess(text)
    if not tokens:
        return ""

    doc = nlp(" ".join(tokens))
    lemmas = []
    for token in doc:
        if token.lemma_ == "-PRON-":
            continue
        if remove_stop_words and token.is_stop:
            continue
        lemmas.append(token.lemma_.strip().lower())
    return " ".join(lemmas)

In [162]:
def chunk_noun_phrases(text):
    '''
    Return the noun chunks found in the pre-processed form of ``text``.

    The text is first cleaned with ``preprocess`` and then parsed again with
    the spaCy pipeline ``nlp``; the text of every noun chunk in that parse is
    collected, in document order, into a list of strings.
    '''
    cleaned = preprocess(text)
    parsed = nlp(cleaned)
    phrases = []
    for chunk in parsed.noun_chunks:
        phrases.append(chunk.text)
    return phrases

In [163]:
# Load the reviews and normalise the column names to lowercase.
data = pd.read_csv('../data/B005SUHPO6.csv')
data.columns = map(str.lower, data.columns)
data_reviews = data[['reviewtext']]

# Draw a small working sample:
#   - rows without review text are skipped so they cannot end up in the sample,
#   - n is capped at the number of available rows so a short file does not raise,
#   - random_state is fixed so a fresh "Restart & Run All" picks the same rows.
reviews_with_text = data_reviews.dropna(subset=['reviewtext'])
sample = reviews_with_text.sample(n=min(15, len(reviews_with_text)), random_state=100)

In [164]:
# Add the cleaned text and the noun-phrase list for every sampled review,
# both computed from the raw 'reviewtext' column.
sample = sample.assign(
    preprocess_text=sample['reviewtext'].map(preprocess),
    noun_phrase=sample['reviewtext'].map(chunk_noun_phrases),
)

# One list of noun phrases per review, in sample order.
sample_noun_phrase = sample['noun_phrase'].tolist()

In [165]:
# Vocabulary of noun phrases: each distinct phrase is one dictionary token.
corpus_dictionary = corpora.Dictionary(sample_noun_phrase)

# Prune rare phrases (no_below=2); filter_extremes' other defaults still apply.
# NOTE(review): a review whose phrases are all pruned becomes an empty
# bag-of-words below - confirm that is acceptable for the model.
corpus_dictionary.filter_extremes(no_below=2)

# Bag-of-words representation of every review against the pruned vocabulary.
processed_corpus = list(map(corpus_dictionary.doc2bow, sample_noun_phrase))

In [173]:
# Train an LDA topic model on the bag-of-words corpus.
number_of_topics = 7
lda = gensim.models.ldamodel.LdaModel
lda_model = lda(
    corpus=processed_corpus,
    id2word=corpus_dictionary,
    num_topics=number_of_topics,
    passes=10,
    random_state=100,  # fixed seed so training is repeatable
    chunksize=1000,
)

In [174]:
# Score the fitted model on the corpus it was trained on.
lda_model_perplexity = lda_model.log_perplexity(processed_corpus)

# c_v coherence computed from the tokenised noun phrases.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=sample_noun_phrase,
    dictionary=corpus_dictionary,
    coherence='c_v',
)
lda_model_coherence = coherence_model.get_coherence()

In [175]:
# Report both evaluation scores. The first value is whatever
# lda_model.log_perplexity returned (gensim documents it as a per-word
# likelihood bound, which is why it is negative).
print('\nPerplexity: {} \nCoherence Value: {}'.format(lda_model_perplexity, lda_model_coherence))


Perplexit: -3.174936756576154 
Coherence Value: 0.45920651083621283


In [176]:
# Print every topic (-1 = all of them) with its weighted terms.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic {}: \nWords: {}'.format(topic_id, topic_terms))

Topic 0: 
Words: 0.246*"phone" + 0.245*"the otterbox defender" + 0.245*"this case" + 0.209*"the case" + 0.030*"friend" + 0.026*"this product"
Topic 1: 
Words: 0.441*"this product" + 0.441*"the case" + 0.030*"phone" + 0.030*"this case" + 0.030*"the otterbox defender" + 0.030*"friend"
Topic 2: 
Words: 0.167*"phone" + 0.167*"the case" + 0.167*"this case" + 0.167*"this product" + 0.167*"the otterbox defender" + 0.167*"friend"
Topic 3: 
Words: 0.846*"phone" + 0.031*"the case" + 0.031*"the otterbox defender" + 0.031*"this case" + 0.031*"this product" + 0.031*"friend"
Topic 4: 
Words: 0.728*"the otterbox defender" + 0.057*"this product" + 0.054*"this case" + 0.054*"phone" + 0.054*"the case" + 0.054*"friend"
Topic 5: 
Words: 0.747*"friend" + 0.051*"the case" + 0.051*"this case" + 0.051*"phone" + 0.051*"the otterbox defender" + 0.050*"this product"
Topic 6: 
Words: 0.694*"this case" + 0.202*"the case" + 0.026*"phone" + 0.026*"the otterbox defender" + 0.026*"friend" + 0.025*"this product"


In [177]:
# Per-topic term weights as (topic id, 'weight*"term" + ...') pairs; these are
# formatted strings, not a numeric matrix.
aspect_word_prob = lda_model.print_topics()
# Per-review topic mixture: iterating this yields, for each review, a list of
# (topic id, probability) pairs.
review_aspect_prob = lda_model[processed_corpus]

In [178]:
# Display each topic reduced to its three highest-weighted terms.
lda_model.show_topics(num_words=3)

[(0, '0.246*"phone" + 0.245*"the otterbox defender" + 0.245*"this case"'),
 (1, '0.441*"this product" + 0.441*"the case" + 0.030*"phone"'),
 (2, '0.167*"phone" + 0.167*"the case" + 0.167*"this case"'),
 (3, '0.846*"phone" + 0.031*"the case" + 0.031*"the otterbox defender"'),
 (4,
  '0.728*"the otterbox defender" + 0.057*"this product" + 0.054*"this case"'),
 (5, '0.747*"friend" + 0.051*"the case" + 0.051*"this case"'),
 (6, '0.694*"this case" + 0.202*"the case" + 0.026*"phone"')]

In [179]:
# Print the (topic id, probability) pairs inferred for each review in turn.
for doc_topics in review_aspect_prob:
    print(doc_topics)

[(0, 0.071428746), (1, 0.5714253), (2, 0.071430355), (3, 0.07142878), (4, 0.071429126), (5, 0.07142892), (6, 0.071428746)]
[(0, 0.07152853), (1, 0.07142866), (2, 0.07142939), (3, 0.57132727), (4, 0.07142874), (5, 0.07142872), (6, 0.071428664)]
[(0, 0.047666553), (1, 0.71419126), (2, 0.047620036), (3, 0.04761916), (4, 0.0476193), (5, 0.047619242), (6, 0.047664452)]
[(0, 0.035922244), (1, 0.03571445), (2, 0.035715718), (3, 0.28563005), (4, 0.2857228), (5, 0.03571457), (6, 0.2855802)]
[(0, 0.071553774), (1, 0.07142869), (2, 0.071429595), (3, 0.071428694), (4, 0.07142879), (5, 0.071428776), (6, 0.5713017)]
[(0, 0.023977099), (1, 0.024000034), (2, 0.023811307), (3, 0.19041798), (4, 0.19053939), (5, 0.19057292), (6, 0.35668132)]
[(0, 0.14285715), (1, 0.14285715), (2, 0.14285715), (3, 0.14285715), (4, 0.14285715), (5, 0.14285715), (6, 0.14285715)]
[(0, 0.14285715), (1, 0.14285715), (2, 0.14285715), (3, 0.14285715), (4, 0.14285715), (5, 0.14285715), (6, 0.14285715)]
[(0, 0.14285715), (1, 0.142