In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import LatentDirichletAllocation as LDA
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from time import time
import csv
import os
import sys

# Extend the import search path with the directory two levels above the
# current working directory so the `utils` package imported below resolves.
root = os.getcwd()
sys.path.append(root + "/../..")

from utils.stopWords import stopWords

In [10]:
# CONSTANTS
# Tunable settings for the run; the cells below read these names.
datasetFilepath = "../../data/data.csv" # CSV opened by the data-extraction cell (relative path)
countTopics = 5 # Passed to LDA as n_components
countTopWords = 10 # Only show the top 10 words in a topic
countFeatures = 100 # Passed to CountVectorizer as max_features
ngramRange = (1, 1) # Passed to CountVectorizer as ngram_range (single words only)
tokenPattern= r'(?u)\b[A-Za-z]+\b' # Only include letters, remove any numerical characters
maxReviewRating = 3 # Must be number between 1 and 5; only rows with reviewRating <= this are kept
maxReviews = 10000 # NOTE(review): not referenced by any cell visible in this notebook
customStopWords = text.ENGLISH_STOP_WORDS.union(stopWords) # scikit-learn's English list plus the project's own list
minDF = 2 # Passed as min_df: keep only terms that show up in at least 2 documents
maxDF = 0.95 # Passed as max_df: drop terms that occur in more than 95% of the documents

In [3]:
# WordNet data is needed by WordNetLemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def tokenizer(word):
    """Return a list with the WordNet lemma of every item in ``word``.

    ``word`` is iterated directly, so it is expected to be an iterable of
    token strings.
    NOTE(review): a plain string would be walked character by character,
    which is presumably not intended - confirm how this is meant to be called
    (no cell visible here calls it).
    """
    return list(map(lemmatizer.lemmatize, word))

[nltk_data] Downloading package wordnet to /Users/enaluz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the dataset and keep the text of every review rated at or below
# maxReviewRating. Rows that cannot be interpreted are reported and skipped.
startTime = time()
corpus = []
# newline='' is required by the csv module so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding keeps decoding the same
# on every platform instead of depending on the locale default.
with open(datasetFilepath, 'r', newline='', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        try:
            if float(row["reviewRating"]) <= maxReviewRating:
                review = row["reviewContent"]
                # DictReader fills fields missing from a short row with None;
                # such a value cannot be tokenized later, so leave it out.
                if review is not None:
                    corpus.append(review)
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: column absent; TypeError: rating missing (None);
            # ValueError: rating is not numeric.
            print("Catching error: ", e)

print("Data extraction completed in %f seconds" %
      (time() - startTime))

print("data length: %s \n" % len(corpus))

countSamples = len(corpus)

Data extraction completed in 0.410339 seconds
data length: 4982 



In [5]:
# Remove stop words from every review and replace `corpus` with the result.
startTime = time()

# Tokenizer models used by word_tokenize.
nltk.download('punkt')

validCorpus = []
for review in corpus:
    # customStopWords is a frozenset, so membership tests are already hash
    # lookups and no extra dict is needed. The lower-cased form is checked as
    # well so that capitalized stop words ("The", "And") are removed too.
    validWords = [
        word
        for word in word_tokenize(review)
        if word not in customStopWords and word.lower() not in customStopWords
    ]
    validCorpus.append(" ".join(validWords))

corpus = validCorpus

print("Filtering stop words completed in %f seconds" % (time() - startTime))

[nltk_data] Downloading package punkt to /Users/enaluz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filtering stop words completed in 7.199851 seconds


In [6]:
# Build the term-frequency (bag-of-words) matrix that the topic model is fitted on.
startTime = time()

TFVectorizer = CountVectorizer(
    max_df=maxDF,
    min_df=minDF,
    max_features=countFeatures,
    ngram_range=ngramRange,
    token_pattern=tokenPattern,
    # customStopWords is a frozenset; newer scikit-learn releases validate
    # this parameter and accept only 'english', a list, or None.
    stop_words=sorted(customStopWords)
)

TF = TFVectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it. The result is
# kept as a plain list either way.
if hasattr(TFVectorizer, "get_feature_names_out"):
    featureNames = list(TFVectorizer.get_feature_names_out())
else:
    featureNames = TFVectorizer.get_feature_names()

print("Vectorization completed in %f seconds" % (time() - startTime))

  'stop_words.' % sorted(inconsistent))


Vectorization completed in 0.438971 seconds


In [7]:
# Fit the LDA topic model on the term-frequency matrix.
startTime = time()

# The model fitted here is LatentDirichletAllocation, so the message names LDA.
print("Fitting the LDA model with countSamples=%d and countFeatures=%d \n" % (countSamples, countFeatures))

# A fixed random_state makes the discovered topics reproducible across runs.
LDAModel = LDA(n_components=countTopics, random_state=42).fit(TF)

print("Completed model fitting in %f seconds" % (time() - startTime))

Fitting the NMF model with countSamples=4982 and countFeatures=100 

Completed model fitting in 25.422411 seconds


In [12]:
# For each topic, translate the indexes of its highest-weighted features back
# into feature names and print them, strongest first.
for topicNumber, weights in enumerate(LDAModel.components_, start=1):
    print("Topic %d:" % topicNumber)
    # argsort is ascending, so reverse it and keep the leading countTopWords.
    topIndexes = weights.argsort()[::-1][:countTopWords]
    topWords = [featureNames[position] for position in topIndexes]
    print(", ".join(topWords))

Topic 1:
flavor, taste, did, fries, ordered, meal, tasted, dishes, came, cold
Topic 2:
did, asked, said, came, waiter, told, table, ordered, time, wanted
Topic 3:
really, ordered, okay, place, did, lunch, special, got, came, try
Topic 4:
wait, table, minutes, time, waiting, hour, long, waited, service, seated
Topic 5:
great, service, friendly, staff, nice, went, place, ordered, little, dinner
Topic 6:
service, time, did, experience, location, server, bad, know, times, want
Topic 7:
pretty, ve, place, came, stars, time, quite, come, bit, think
Topic 8:
place, better, small, people, high, reviews, little, really, lot, price
Topic 9:
ok, did, better, cooked, best, service, think, maybe, taste, went
Topic 10:
got, bar, drinks, place, friend, nice, area, night, people, service
