In [118]:
import pandas as pd
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Load the spaCy English pipeline once; preprocess() and chunk_noun_phrases() both reuse it.
# NOTE(review): the "en" shortcut name appears to resolve only on spaCy 2.x; spaCy 3 expects a
# full package name such as "en_core_web_sm" — confirm which spaCy version this notebook targets.
nlp = spacy.load("en")

# Do not truncate column contents, so long review texts stay fully visible when frames are displayed.
pd.set_option('display.max_colwidth', None)

In [119]:
def preprocess(text, remove_stopwords=False):
    '''
    Normalise a raw review into a lowercase, lemmatised, space-separated string.

    Steps:
        - tokenise with gensim's ``simple_preprocess`` (lowercases and drops punctuation)
        - lemmatise every token with the spaCy pipeline ``nlp``
        - drop pronoun placeholders (tokens whose lemma is "-PRON-")
        - optionally drop stop words

    Parameters
    ----------
    text : str or None
        Raw review text. Missing values (``None`` or NaN, which pandas produces
        for empty CSV cells) yield an empty string instead of raising.
    remove_stopwords : bool, default False
        When True, tokens that spaCy flags as stop words are removed as well.
        The default keeps the output identical for existing callers, which
        never filtered stop words.

    Returns
    -------
    str
        The cleaned text, or "" when the input is missing.
    '''
    # Missing review text arrives from pandas as None or float NaN; neither can be tokenised.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""

    preprocessed_text = " ".join(simple_preprocess(text))

    lemmas = []
    for token in nlp(preprocessed_text):
        # "-PRON-" is the placeholder lemma the pipeline assigns to pronouns.
        if token.lemma_ == "-PRON-":
            continue
        if remove_stopwords and token.is_stop:
            continue
        lemmas.append(token.lemma_.strip().lower())
    return " ".join(lemmas)

In [120]:
def chunk_noun_phrases(text):
    '''
    Extract the noun phrases of a review.

    The text is first normalised with ``preprocess``; the cleaned string is then
    parsed by the spaCy pipeline ``nlp`` and the text of every noun chunk is
    collected into a list of strings.
    '''
    cleaned_text = preprocess(text)
    parsed = nlp(cleaned_text)
    noun_phrases = []
    for chunk in parsed.noun_chunks:
        noun_phrases.append(chunk.text)
    return noun_phrases

In [121]:
# Load the reviews for one product; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/B005SUHPO6.csv')
# Lower-case the headers so the column lookup below does not depend on the CSV's casing.
data.columns = [str(column).lower() for column in data.columns]
data_reviews = data[['reviewtext']]
# Draw at most 15 reviews (fewer if the file is smaller) with a fixed seed, so a
# Restart & Run All works on the same rows every time.
# .copy() makes `sample` an independent frame: columns added to it later neither
# trigger SettingWithCopyWarning nor touch `data_reviews`.
sample = data_reviews.sample(n=min(15, len(data_reviews)), random_state=42).copy()

In [122]:
# Keep the cleaned text next to the raw review for inspection, and extract noun phrases per review.
sample['preprocess_text'] = sample['reviewtext'].apply(preprocess)
sample['noun_phrase'] = sample['reviewtext'].apply(chunk_noun_phrases)
# Plain Python list with one list of phrase strings per sampled review.
sample_noun_phrase = sample['noun_phrase'].tolist()

In [152]:
# Vocabulary over the noun phrases: each distinct phrase string is one dictionary entry.
corpus_dictionary = corpora.Dictionary(documents=sample_noun_phrase)
# Prune phrases that occur in fewer than two reviews (other filter_extremes limits stay at their defaults).
corpus_dictionary.filter_extremes(no_below=2)
# Bag-of-words encoding of every review against the pruned vocabulary.
processed_corpus = list(map(corpus_dictionary.doc2bow, sample_noun_phrase))

In [153]:
number_of_topics = 7
lda = gensim.models.ldamodel.LdaModel
# LDA training is randomised; pinning random_state makes the learned topics (and
# every score derived from them) repeat across kernel restarts.
lda_model = lda(
    corpus=processed_corpus,
    id2word=corpus_dictionary,
    num_topics=number_of_topics,
    passes=3,
    random_state=42,
)

In [154]:
# NOTE(review): gensim documents log_perplexity() as returning a per-word likelihood
# bound, not a raw perplexity value — confirm before interpreting the number.
lda_model_perplexity = lda_model.log_perplexity(processed_corpus)

# Coherence ('c_v' measure) is scored against the same noun-phrase documents the
# dictionary was built from.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=sample_noun_phrase,
    dictionary=corpus_dictionary,
    coherence='c_v',
)
lda_model_coherence = coherence_model.get_coherence()

In [155]:
# The first value comes from log_perplexity(), i.e. a per-word bound on a log scale
# (negative in practice) rather than a raw perplexity, so it is labelled accordingly.
print('\nLog perplexity (per-word bound): {} \nCoherence Value: {}'.format(lda_model_perplexity, lda_model_coherence))


Perplexit: -3.126320092138379 
Coherence Value: 0.4258567930689562


In [156]:
# print_topics yields (topic id, formatted "weight*term + ..." string) pairs;
# -1 is passed to request every topic rather than a subset.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic {}: \nWords: {}'.format(topic_id, topic_terms))

Topic 0: 
Words: 0.348*"the holster" + 0.251*"the phone" + 0.192*"the otterbox" + 0.056*"iphone" + 0.056*"phone" + 0.056*"don" + 0.007*"the store" + 0.007*"real otterbox" + 0.007*"time" + 0.007*"this case"
Topic 1: 
Words: 0.228*"phone" + 0.158*"this case" + 0.158*"iphone" + 0.158*"time" + 0.158*"the phone" + 0.020*"the store" + 0.020*"the otterbox" + 0.020*"real otterbox" + 0.020*"the case" + 0.020*"the holster"
Topic 2: 
Words: 0.421*"time" + 0.053*"phone" + 0.053*"real otterbox" + 0.053*"the otterbox" + 0.053*"the case" + 0.053*"the store" + 0.053*"the phone" + 0.053*"iphone" + 0.053*"this case" + 0.053*"the color"
Topic 3: 
Words: 0.083*"the store" + 0.083*"the case" + 0.083*"phone" + 0.083*"the otterbox" + 0.083*"this case" + 0.083*"time" + 0.083*"iphone" + 0.083*"real otterbox" + 0.083*"the phone" + 0.083*"the color"
Topic 4: 
Words: 0.221*"iphone" + 0.221*"this case" + 0.221*"the store" + 0.118*"the color" + 0.118*"real otterbox" + 0.015*"phone" + 0.015*"the otterbox" + 0.015*"t

In [157]:
# NOTE(review): print_topics() returns (topic_id, formatted "weight*term + ..." string) pairs,
# as the topic printout earlier in the notebook shows — it is not a numeric topic-word matrix.
aspect_word_prob = lda_model.print_topics() # topic-word summary strings
# Indexing the model with the corpus gives, per review, a list of (topic_id, probability) pairs.
review_aspect_prob = lda_model[processed_corpus] # document-topic distributions

In [158]:
# Compact view of the model: only the three highest-weighted phrases of each topic.
lda_model.show_topics(num_words=3)

[(0, '0.348*"the holster" + 0.251*"the phone" + 0.192*"the otterbox"'),
 (1, '0.228*"phone" + 0.158*"this case" + 0.158*"iphone"'),
 (2, '0.421*"time" + 0.053*"phone" + 0.053*"real otterbox"'),
 (3, '0.083*"the store" + 0.083*"the case" + 0.083*"phone"'),
 (4, '0.221*"iphone" + 0.221*"this case" + 0.221*"the store"'),
 (5, '0.421*"real otterbox" + 0.053*"phone" + 0.053*"this case"'),
 (6, '0.363*"the case" + 0.319*"phone" + 0.100*"the color"')]

In [159]:
# Each item is one review's list of (topic_id, probability) pairs.
for review_topics in review_aspect_prob:
    print(review_topics)

[(0, 0.07148387), (1, 0.0717331), (2, 0.07142905), (3, 0.07142941), (4, 0.07142871), (5, 0.07142905), (6, 0.57106674)]
[(0, 0.9495139)]
[(0, 0.07142864), (1, 0.07142874), (2, 0.07142902), (3, 0.07142935), (4, 0.071546674), (5, 0.57130885), (6, 0.07142869)]
[(0, 0.047619175), (1, 0.0476929), (2, 0.04761978), (3, 0.047620315), (4, 0.71409404), (5, 0.04761978), (6, 0.04773402)]
[(0, 0.01588142), (1, 0.01588998), (2, 0.015873162), (3, 0.015873266), (4, 0.015884012), (5, 0.015873162), (6, 0.904725)]
[(0, 0.03572584), (1, 0.03574877), (2, 0.035714738), (3, 0.035715066), (4, 0.78539354), (5, 0.03598766), (6, 0.035714407)]
[(0, 0.07142864), (1, 0.07160161), (2, 0.57125396), (3, 0.07142935), (4, 0.0714287), (5, 0.071429014), (6, 0.07142869)]
[(0, 0.7142236), (1, 0.047679864), (2, 0.047619335), (3, 0.047619544), (4, 0.047619138), (5, 0.047619335), (6, 0.047619127)]
[(0, 0.04764357), (1, 0.04778372), (2, 0.04761948), (3, 0.0476198), (4, 0.71409476), (5, 0.04761948), (6, 0.047619164)]
[(0, 0.57141