In [1]:
import pandas as pd
import re
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag
import gensim

pd.set_option('display.max_colwidth', None)

In [2]:
data = pd.read_csv('../data/B0090AAOUW.csv')
data.columns = map(str.lower, data.columns)
data_reviewtext = data[['reviewtext']].copy()
data_reviewtext['index'] = data_reviewtext.index
data.head(3)

Unnamed: 0,unnamed: 0,reviewerid,asin,reviewername,helpful,reviewtext,overall,summary,unixreviewtime,reviewtime
0,1961951,A2UATTW1F1O6XK,B0090AAOUW,2know,"[8, 9]","This is a 3G phone, simple and I am loving it. I ordered this phone from lolbuy, I purchase this phone on April 19, 2013. I order this phone and request this phone in white. All I can say that I received my phone on April 29, 2013. This not bad at all for to say that this phone came to the US all the way from Hong Kong. I received my phone in White, and I really love this phone. Everyone really need to order from this seller. Awesome shipping, quick response to email. Outstanding Seller. I really just love my phone, not complaints and i really recommend this phone and seller to a friend",5.0,AWesome Seller and Awesome Phone,1367971200,"05 8, 2013"
1,1961952,A9JWKFBB8FZT9,B0090AAOUW,Abunya Moses,"[4, 5]","The only reason why i am rating this phone a 4 star is because i am waiting to see how it continues to maintain its current excellent status---other wise, its a 5 star for me. I ordered this phone locally in the US. It took four days to prepare the shipping but once the shipping left the seller facility, i got it within two days. I have had it for just over a week but i can testify so far that its an excellent phone. I wouldn't advice anyone to go waste precious money on a Samsung note. This phone does it well. The camera is good. Internet is fast, depending on your network provider--and it uses a Sim card. Mine is Ultra mobile which uses T-Mobile network but its perfect. Sound is good and features are excellent. The front camera could be a little better but most front cameras are like that on most phones anyway. The back camera is above average and clean! I am definitely loving it and will be happy to answer anyone's question related to this phone because although i have had it for a little more than a week, i am quick with exploring phone features and i already tested all the features. I have good information to share. Its not only insanely cheap, it is a great phone--Go for it surely!!!!",4.0,Definately the phone to buy. I am glad i went for it finally!!!!!,1372118400,"06 25, 2013"
2,1961953,A1RL0QW0HGNMZY,B0090AAOUW,Adam Andall,"[1, 2]",These phones just keep freezing up all the time. Not reading memory card.Not holding internet connection. will not recommend it to anyone.,2.0,Just a flashy phone. Nothing more,1383523200,"11 4, 2013"


In [3]:
def preprocess_data(text):
#     sentences = []
    for line in text.splitlines():
        sentence = re.sub('<.*?>', '', line) # remove html tags
        sentence = re.sub(r'\d+', '', sentence) # remove nunumeric characters
        sentence = re.sub(r'[!”#$%&()*+,-./:;<=>?@[\]^_`{|}~]', '', sentence) # remove punctuations
        sentence = sentence.strip().lower() # remove leading/ending spaces and case fold to lower
        sentence = remove_stopwords(sentence)
        sentence = lemmatize_words(sentence)
#         sentences.append(sentence)
        
    return sentence

In [4]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [5]:
def lemmatize_words(text):
    result = []
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    pos_tagged_text = pos_tag(text.split())
    
    for word, pos in pos_tagged_text:
        result.append(lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)))
    
    return result

#     return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [6]:
data_reviewtext['preprocess_text'] = data_reviewtext['reviewtext'].map(preprocess_data)
data_reviewtext['preprocess_text']

0                                                                                                                                                                                                                                                                                                                                                                               [g, phone, simple, loving, order, phone, lolbuy, purchase, phone, april, order, phone, request, phone, white, say, receive, phone, april, bad, say, phone, come, u, way, hong, kong, receive, phone, white, really, love, phone, everyone, really, need, order, seller, awesome, shipping, quick, response, email, outstanding, seller, really, love, phone, complaint, really, recommend, phone, seller, friend]
1      [reason, rating, phone, star, wait, see, continue, maintain, current, excellent, statusother, wise, star, order, phone, locally, u, take, four, day, prepare, ship, ship, leave, seller, facility, get, within, two, day, week,

In [9]:
processed_docs = data_reviewtext['preprocess_text']
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=25, no_above=0.5, keep_n=100000)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 awesome
1 bad
2 come
3 email
4 everyone
5 g
6 love
7 need
8 order
9 purchase
10 really


In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[1]

[(6, 1),
 (8, 1),
 (14, 1),
 (15, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 4),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 2),
 (26, 3),
 (27, 1),
 (28, 1),
 (29, 3),
 (30, 2),
 (31, 1),
 (32, 1),
 (33, 4),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 2),
 (39, 1),
 (40, 1),
 (41, 2),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 1),
 (48, 1),
 (49, 2),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 1),
 (58, 1)]

In [11]:
from gensim import corpora, models
from pprint import pprint

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.1863041189220724),
 (1, 0.1402906827129922),
 (2, 0.093424642884464),
 (3, 0.1752966389968964),
 (4, 0.20661829051601466),
 (5, 0.14532476317316909),
 (6, 0.19214942101249646),
 (7, 0.12476111192493058),
 (8, 0.3403296755865487),
 (9, 0.1306399787719773),
 (10, 0.42832876172569695),
 (11, 0.22282731251365606),
 (12, 0.12928320278781616),
 (13, 0.22483982544815412),
 (14, 0.3919199363159319),
 (15, 0.17977083299853866),
 (16, 0.1537627493446934),
 (17, 0.38342872506654363)]


In [12]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [13]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"one" + 0.020*"case" + 0.020*"card" + 0.020*"love" + 0.017*"sim" + 0.016*"work" + 0.016*"buy" + 0.016*"order" + 0.016*"would" + 0.015*"time"
Topic: 1 
Words: 0.027*"get" + 0.024*"good" + 0.021*"work" + 0.017*"like" + 0.015*"picture" + 0.014*"use" + 0.014*"back" + 0.014*"call" + 0.014*"take" + 0.013*"come"
Topic: 2 
Words: 0.042*"get" + 0.025*"screen" + 0.022*"use" + 0.019*"work" + 0.017*"go" + 0.016*"like" + 0.014*"battery" + 0.013*"card" + 0.013*"buy" + 0.012*"still"
Topic: 3 
Words: 0.027*"work" + 0.023*"get" + 0.022*"one" + 0.019*"good" + 0.019*"would" + 0.017*"battery" + 0.016*"screen" + 0.015*"like" + 0.014*"use" + 0.014*"great"
Topic: 4 
Words: 0.043*"work" + 0.024*"great" + 0.022*"use" + 0.017*"get" + 0.015*"sim" + 0.015*"come" + 0.014*"buy" + 0.014*"talk" + 0.014*"screen" + 0.014*"good"


In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))