# Loading the data

In [1]:
import pandas as pd
# Read the Automotive review file into a DataFrame.
df_automotive = pd.read_csv(filepath_or_buffer='reviews_Automotive_5.csv')

In [2]:
# Label every Automotive row with its category name (scalar is broadcast to all rows).
df_automotive.loc[:, 'tag'] = 'automotive'

In [3]:
# Keep only the review text and its category label.
automotive = df_automotive.loc[:, ['reviewText', 'tag']]

In [4]:
# Read the Beauty reviews and label every row with its category name.
df_beauty = pd.read_csv('reviews_Beauty_5.csv').assign(tag='beauty')
# Keep only the review text and its category label.
beauty = df_beauty.loc[:, ['reviewText', 'tag']]

In [5]:
# Read the Clothing/Shoes/Jewelry reviews and label every row with its category name.
df_clothingShoesJewelry = pd.read_csv(
    'reviews_Clothing_Shoes_and_Jewelry_5.csv'
).assign(tag='clothingShoesJewelry')
# Keep only the review text and its category label.
clothingShoesJewelry = df_clothingShoesJewelry.loc[:, ['reviewText', 'tag']]

In [6]:
# Read the Electronics reviews and label every row with its category name.
df_electronics = pd.read_csv('reviews_Electronics_5.csv').assign(tag='electronics')
# Keep only the review text and its category label.
electronics = df_electronics.loc[:, ['reviewText', 'tag']]

In [7]:
# Read the Home & Kitchen reviews and label every row with its category name.
df_homeKitchen = pd.read_csv('reviews_Home_and_Kitchen_5.csv').assign(tag='homeKitchen')
# Keep only the review text and its category label.
homeKitchen = df_homeKitchen.loc[:, ['reviewText', 'tag']]

In [8]:
# Stack the five per-category frames into one; ignore_index=True renumbers
# the rows 0..n-1 so later positional lookups by row number are valid.
category_frames = [automotive, beauty, clothingShoesJewelry, electronics, homeKitchen]
reviews = pd.concat(category_frames, ignore_index=True)

In [9]:
# Print row count, per-column non-null counts and dtypes of the combined frame
# (useful here to spot reviews whose text is missing).
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
reviewText    49983 non-null object
tag           50000 non-null object
dtypes: object(2)
memory usage: 781.3+ KB


In [34]:
# Summary statistics for the two object columns: count, number of unique
# values, most frequent value and its frequency.
reviews.describe()

Unnamed: 0,reviewText,tag
count,49983,50000
unique,49982,5
top,Good,electronics
freq,2,10000


# Preprocessing the reviews

In [10]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim

In [11]:
# Tokenizer that keeps runs of word characters and drops everything else
# (punctuation, whitespace).
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [12]:
# English stop words from two independent sources.
nltk_stpwd = stopwords.words('english')
stop_words_stpwd = get_stop_words('en')
# Union of both lists with duplicates removed (element order is not meaningful).
merged_stopwords = list(set(nltk_stpwd) | set(stop_words_stpwd))

In [13]:
# Snowball stemmer used to reduce English tokens to their stems.
sb_stemmer = SnowballStemmer(language='english')

In [30]:
# Turn every review into a list of lower-cased, stop-word-free, stemmed tokens.
num_reviews = reviews.shape[0]

# Raw review texts in row order (missing reviews show up as non-str values).
doc_set = list(reviews['reviewText'])

# Set membership is O(1); testing against the merged_stopwords list would
# rescan the whole list for every single token.
stopword_set = set(merged_stopwords)

texts = []

for doc in doc_set:
    # Skip missing reviews (NaN). NOTE: because of this skip, positions in
    # `texts` do not line up one-to-one with row numbers in `reviews`.
    if isinstance(doc, str):
        tokens = tokenizer.tokenize(doc.lower())
        stopped_tokens = [token for token in tokens if token not in stopword_set]
        stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
        texts.append(stemmed_tokens)

In [25]:
# Number of tokenized documents; reviews with missing text were skipped,
# so this can be smaller than the number of rows in `reviews`.
len(texts)

49983

# Transforming tokenized documents into an id-term dictionary

In [16]:
# Build the token -> integer-id mapping over all tokenized reviews.
texts_dict = corpora.Dictionary(documents=texts)
# Persist the dictionary; what is saved here is the full, unfiltered vocabulary.
texts_dict.save('reviews.dict')
# Print a short summary: vocabulary size and a few sample tokens.
print(texts_dict)

Dictionary(39692 unique tokens: ['acidiccon', 'afullmark', 'barley', 'becom', 'behind']...)


In [17]:
import operator
# Prune the vocabulary in place: drop tokens that occur in fewer than 30
# documents or in more than 15% of all documents.
texts_dict.filter_extremes(no_below=30, no_above=0.15)
# filter_extremes reassigns token ids, so the copy of the dictionary saved
# before pruning no longer matches the ids used from here on (bag-of-words
# corpus, LDA model). Re-save so the file on disk stays consistent with them.
texts_dict.save('reviews.dict')
print(texts_dict)
print("top terms:")
# "top" here means the ten tokens with the lowest ids, not the most frequent ones.
print(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1))[:10])

Dictionary(4267 unique tokens: ['becom', 'behind', 'bottl', 'brush', 'clean']...)
top terms:
[('becom', 0), ('behind', 1), ('bottl', 2), ('brush', 3), ('clean', 4), ('complic', 5), ('dirt', 6), ('easi', 7), ('enough', 8), ('even', 9)]


# Creating bag of words

In [18]:
# Convert each tokenized review into a sparse bag-of-words vector of
# (token_id, count) pairs, using the current dictionary.
corpus = list(map(texts_dict.doc2bow, texts))
len(corpus)

49983

In [19]:
# Write the bag-of-words corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize(fname='reviews.mm', corpus=corpus)

# Training an LDA model

In [20]:
# Train an LDA topic model on the bag-of-words corpus.
# num_topics=5 mirrors the five review categories loaded above; passes=4
# means four full sweeps over the corpus.
# random_state pins the random initialisation so a Restart & Run All
# reproduces the same topics.
# NOTE(review): the recorded run of this cell emitted numpy warnings and the
# topics displayed afterwards are all NaN. The cause is not established from
# this notebook (cells were executed out of order) -- re-run on a fresh kernel
# and confirm the topic weights are finite before trusting any predictions.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=texts_dict,
    num_topics=5,
    passes=4,
    random_state=42,
)

  expElogthetad = np.exp(Elogthetad)
  logger.info("topic diff=%f, rho=%f", np.mean(np.abs(diff)), rho)


In [140]:
# Display the 10 highest-weighted terms of each of the 5 topics as
# 'weight*"term" + ...' strings.
# NOTE(review): weights rendered as `nan` mean the trained model is unusable.
lda_model.show_topics(num_topics=5,num_words=10)

[(0,
  'nan*"document" + nan*"discharg" + nan*"vac" + nan*"seldom" + nan*"kudo" + nan*"illustr" + nan*"english" + nan*"card" + nan*"alarm" + nan*"acquir"'),
 (1,
  'nan*"document" + nan*"discharg" + nan*"vac" + nan*"seldom" + nan*"kudo" + nan*"illustr" + nan*"english" + nan*"card" + nan*"alarm" + nan*"acquir"'),
 (2,
  'nan*"document" + nan*"discharg" + nan*"vac" + nan*"seldom" + nan*"kudo" + nan*"illustr" + nan*"english" + nan*"card" + nan*"alarm" + nan*"acquir"'),
 (3,
  'nan*"document" + nan*"discharg" + nan*"vac" + nan*"seldom" + nan*"kudo" + nan*"illustr" + nan*"english" + nan*"card" + nan*"alarm" + nan*"acquir"'),
 (4,
  'nan*"document" + nan*"discharg" + nan*"vac" + nan*"seldom" + nan*"kudo" + nan*"illustr" + nan*"english" + nan*"card" + nan*"alarm" + nan*"acquir"')]

In [None]:
# Assign each review the id of its most probable LDA topic.
# Topic ids of the 5-topic model are 0..4; the value 5 is a sentinel meaning
# "no prediction" (missing review text, or the model returned no topics).
no_prediction = 5

# Set membership is O(1); scanning the merged_stopwords list per token is not.
stopword_set = set(merged_stopwords)

predictions = []
# Iterate over the actual rows instead of a hard-coded range(50000).
for doc in reviews['reviewText']:
    topic = no_prediction
    if isinstance(doc, str):
        # Same preprocessing as used to build the training corpus.
        tokens = tokenizer.tokenize(doc.lower())
        stopped_tokens = [token for token in tokens if token not in stopword_set]
        stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
        bow_vector = texts_dict.doc2bow(stemmed_tokens)
        # transform into LDA space: list of (topic_id, probability) pairs
        lda_vector = lda_model[bow_vector]
        # Guard against an empty result (presumably what happens when the
        # topic weights are NaN); indexing [-1] on it would raise IndexError.
        if lda_vector:
            topic = sorted(lda_vector, key=lambda item: item[1])[-1][0]
    predictions.append(topic)

# Assign the whole column at once; the element-wise chained assignment
# `reviews.prediction[r] = ...` triggers SettingWithCopyWarning and does not
# write through when pandas copy-on-write is enabled.
reviews['prediction'] = predictions

In [None]:
# Persist the reviews together with their predicted topic. With to_csv's
# defaults the row index is written as the first, unnamed column.
predictions_path = 'reviews_prediction.csv'
reviews.to_csv(predictions_path)

# Model evaluation