In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
pd.options.plotting.backend = 'plotly'

# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#Review Data

##Cleaning

###Import the data from our google drive.

In [None]:
reviews_id = '1itcJ-cExPpHZea7c8hjPMYM6T4l8Z_0w'
downloaded = drive.CreateFile({'id':reviews_id})
downloaded.GetContentFile('filtered-reviews.csv')
reviews = pd.read_csv('filtered-reviews.csv')

###Generate a breif overview of the dataframe.

In [None]:
reviews.describe()

Unnamed: 0,review_id,user_id,business_id,text,date
count,327819,327819,327819,327819,327819
unique,327819,228555,237,327238,327552
top,l3Wk_mvAog6XANIuGQ9C7Q,CfX4sTIFFNaRchNswqhVfg,2BMk_drsikKWslJCXmQtjQ,Omg!There's food was good!Ryan is awesome away...,2014-06-22 00:49:26
freq,1,57,2023,7,3


Notes:


*   All review ids seem to be unique
*   There seem to be around 230k unique users writing reviews
* There are 237 unique businesses represented in the data
* Oddly, there seem to be a few hundred reviews with coppied text
* There are a couple hundred cases where the dates are the same, this is likely due to random change. (Should be verified with a hypothesis test)



###Check for null values

In [None]:
reviews.isna().sum()

review_id      0
user_id        0
business_id    0
text           0
date           0
dtype: int64

Seems like there are no notable nulls in our data.

###Fix Datatypes

In [None]:

reviews.dtypes

review_id      object
user_id        object
business_id    object
text           object
date           object
dtype: object

It looks like everything imported as an object. We will leave most of these untouched, except the date column which needs to be converted to a datetime to become useful.

In [None]:
reviews['date'] = pd.to_datetime(reviews['date'])

In [None]:
reviews.dtypes

review_id              object
user_id                object
business_id            object
text                   object
date           datetime64[ns]
dtype: object

# Language Model

In [None]:
business_series = pd.Series(reviews["business_id"].unique())
random_business = business_series[0]
random_business

'EQ-TZ2eeD_E0BHuvoaeG5Q'

In [None]:
reviews_filtered = reviews[reviews['business_id'] == random_business]

In [None]:
tokenized_reviews = reviews_filtered['text'].apply(lambda doc: nltk.word_tokenize(doc))
tokenized_reviews

0        [Locals, recommended, Milktooth, ,, and, it, '...
13       [Milktooth, is, the, place, to, go, if, you, w...
37       [Busy, place, ,, but, we, were, offered, extra...
62       [Creative, food, and, good, ., We, get, coffee...
82       [Amazing, and, comfortable, atmosphere, with, ...
                               ...                        
41497    [Nice, and, freely, staff, ., Was, a, bit, dis...
41529    [Arrived, at, 1:30pm, ., Walked, around, looki...
41532    [Verdict, :, A, good, brunch, spot, to, try, n...
41533    [For, a, picky, eater, ,, this, place, is, ter...
41535    [So, I, 've, been, here, twice, ..., This, pla...
Name: text, Length: 1421, dtype: object

In [None]:
from nltk.util import ngrams

trigrams = []
for tokens in tokenized_reviews:
    trigrams.extend(list(ngrams(tokens, 3, pad_left=True, pad_right=True)))
trigrams

In [None]:
from nltk.util import ngrams

bigrams = []
for tokens in tokenized_reviews:
    bigrams.extend(list(ngrams(tokens, 2, pad_left=True, pad_right=True)))
bigrams

In [None]:
from nltk.util import ngrams

unigrams = []
for tokens in tokenized_reviews:
    unigrams.extend(list(ngrams(tokens, 1, pad_left=True, pad_right=True)))
unigrams

In [None]:
from collections import Counter

unigram_counts = Counter(unigrams)
total_unigrams = sum(unigram_counts.values())
unigram_probabilities = {
    unigram: count / total_unigrams
    for unigram, count in unigram_counts.items()
}
assert abs(sum(unigram_probabilities.values()) - 1) < 1e-6
unigram_probabilities

In [None]:
from collections import Counter

bigram_counts = Counter(bigrams)
total_bigrams = sum(bigram_counts.values())
bigram_probabilities = {
    # might need to change to the count of bigram / count of first word in current bigram
    # bigram: count /
    bigram: count / total_bigrams
    for bigram, count in bigram_counts.items()
}
assert abs(sum(bigram_probabilities.values()) - 1) < 1e-6
bigram_probabilities

In [None]:
from collections import Counter

trigram_counts = Counter(trigrams)
total_trigrams = sum(trigram_counts.values())
trigram_probabilities = {
    # might need to chang to count of trigram / count of beginning bigram in trigram
    trigram: count / total_trigrams
    for trigram, count in trigram_counts.items()
}
assert abs(sum(trigram_probabilities.values()) - 1) < 1e-6
trigram_probabilities

In [None]:
token_list = []
for review in tokenized_reviews:
  for word in review:
    token_list.append(word)
token_list

In [None]:
class UnigramLM(object):

    def __init__(self, tokens):

        self.mdl = self.train(tokens)

    def train(self, tokens):
        out = pd.Series(tokens).value_counts()
        return out / out.sum()

    def probability(self, words):
        prod = 1
        for token in words:
            if token in self.mdl: prod *= self.mdl[token]
            else: return 0
        return prod

    def sample(self, M):
        return ' '.join(
            [
                np.random.choice(self.mdl.index, p=self.mdl.to_numpy()) \
                for _ in range(M)
            ])


class NGramLM(object):

    def __init__(self, N, tokens):
        # You don't need to edit the constructor,
        # but you should understand how it works!

        self.N = N

        ngrams = self.create_ngrams(tokens)

        self.ngrams = ngrams
        #print(ngrams)

        self.mdl = self.train(ngrams)
        print(self.mdl)

        if N < 2:
            raise Exception('N must be greater than 1')
        elif N == 2:
            self.prev_mdl = UnigramLM(tokens)
        else:
            self.prev_mdl = NGramLM(N-1, tokens)

    def create_ngrams(self, tokens):
        out = []
        for i in range(0, len(tokens) - (self.N - 1)):
            out.append(tuple(tokens[i:i + self.N]))
        return out


    def train(self, ngrams):
        ngrams = pd.Series(ngrams)
        n1grams = ngrams.apply(lambda x: x[0:self.N - 1])
        #print(n1grams)

        ngram_counts = ngrams.value_counts()
        n1gram_counts = n1grams.value_counts()
        #print(n1gram_counts)

        df = pd.DataFrame().assign(
            ngram = ngrams,
            n1gram = n1grams)
        df['prob'] = df.apply(
            lambda x: ngram_counts[x.ngram] / n1gram_counts[x.n1gram], axis=1)

        df = df.drop_duplicates(subset='ngram', keep='first')

        return df


    def probability(self, words):
        words = tuple(words)
        current = self
        total = 1
        for i in range(0, len(words) - self.N + 1):
            p = words[i:i+self.N]
            p1 = p[0:-1]
            try:
                prob = current.mdl.loc[
                    (current.mdl['ngram'] == p) &
                    (current.mdl['n1gram'] == p1), 'prob'
                ].iloc[0]
            except:
                return 0
            total *= prob

        start_index = self.N - 1
        if len(words) < self.N:
            for _ in range(self.N - len(words) - 1):
                current = current.prev_mdl
            start_index = len(words)

        for i in range(start_index, 0, -1):
            current = current.prev_mdl
            p = words[0:i]
            p1 = p[0:i-1]
            try:
                if len(p1) == 0:
                    prob = current.mdl.loc[p]
                else:
                    prob = current.mdl.loc[
                        (current.mdl['ngram'] == p) &
                        (current.mdl['n1gram'] == p1), 'prob'
                    ].iloc[0]
            except:
                return 0
            total *= prob

        return total


    def sample(self, M):
        # Use a helper function to generate sample tokens of length `length`
        def add_token(words):
            subset = words if len(words) < self.N else words[-self.N + 1:]

            current = self
            for _ in range(self.N - len(subset) - 1):
                current = current.prev_mdl

            # print(current.mdl.set_index('n1gram'))
            # print(subset, words)
            try:
                choices = current.mdl.set_index('n1gram').loc[[tuple(subset)]]
                out = choices['ngram'].sample(1, weights=choices['prob'])[0]
                return words[0:len(words) - len(out) + 1] + out
            except Exception as e:
                print(e)
                return words + ('\x03',)


        def gen_sample(length):
            out = ('\x02',)
            for _ in range(length - 1):
                out = out + ('\x02',) if out[-1] == '\x03' else add_token(out)
            return out + ('\x03',)

        # Transform the tokens to strings
        out = gen_sample(M)
        return ' '.join(out)


token_list2 = token_list[:10]
print(token_list2)

threegram = NGramLM(3, token_list2)
sample = threegram.sample(5)

['Locals', 'recommended', 'Milktooth', ',', 'and', 'it', "'s", 'an', 'amazing', 'jewel']
                              ngram                    n1gram  prob
0  (Locals, recommended, Milktooth)     (Locals, recommended)   1.0
1       (recommended, Milktooth, ,)  (recommended, Milktooth)   1.0
2               (Milktooth, ,, and)            (Milktooth, ,)   1.0
3                      (,, and, it)                  (,, and)   1.0
4                     (and, it, 's)                 (and, it)   1.0
5                      (it, 's, an)                  (it, 's)   1.0
6                 ('s, an, amazing)                  ('s, an)   1.0
7              (an, amazing, jewel)             (an, amazing)   1.0
                      ngram          n1gram  prob
0     (Locals, recommended)       (Locals,)   1.0
1  (recommended, Milktooth)  (recommended,)   1.0
2            (Milktooth, ,)    (Milktooth,)   1.0
3                  (,, and)            (,,)   1.0
4                 (and, it)          (and,)   1.0

In [None]:
token_list2

['Locals',
 'recommended',
 'Milktooth',
 ',',
 'and',
 'it',
 "'s",
 'an',
 'amazing',
 'jewel']

#Clustering

In this section we will be creating a clustering algorithm to understand the various types of reviews within our data and the relative frequencies of each review type we define.

### Preprocessing
1. Find all reviews for a business.
2. Tokenize the reviews and find all unique tokens in the data.
3. Create a matrix of tf-idf vectors for each review.



#### 1. Find all reviews for a business.


In [None]:
business_series = pd.Series(reviews["business_id"].unique())
business_x = business_series[0]
reviews_x = reviews[reviews["business_id"] == business_x].reset_index(drop=True)
reviews_x

NameError: ignored

In [None]:
stopwords = list(set(stopwords.words('english')))
print(stopwords)

['y', 'am', 'have', 'but', 'so', 'ours', "hadn't", "weren't", 'has', 'through', 'i', 'up', 'its', 'of', 'were', 'our', 'don', 'his', 'hers', 'at', 're', "didn't", 'off', 'are', 'my', "you've", 'than', 'is', 'me', 'did', 'why', 'all', 'as', "wouldn't", 'between', 'other', 's', 'very', 'mightn', 'where', 'about', 'having', 'yours', 'just', "should've", 'do', 'down', "mightn't", 'didn', 'because', 'their', "isn't", "shan't", 'again', 'here', "mustn't", 'does', "don't", 've', 'nor', 'not', 'in', 'what', "needn't", 'this', "it's", 'same', "you'll", 'an', 't', "doesn't", 'we', 'own', 'ma', 'if', 'or', "shouldn't", 'above', 'once', 'when', 'both', 'can', 'shan', 'mustn', "couldn't", 'by', 'theirs', 'after', "hasn't", 'to', 'hasn', 'being', 'yourselves', 'won', 'the', 'they', 'her', 'during', 'some', 'd', 'any', 'few', "haven't", 'more', 'until', "she's", 'm', 'these', 'for', 'under', 'which', 'on', 'himself', 'your', 'haven', 'should', 'them', 'too', 'itself', 'couldn', 'needn', 'o', "that'll

In [None]:
#reviews_x.text.replace('|'.join(stopwords), '', regex=True)
reviews_x['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

0       Locals recommended Milktooth, amazing jewel In...
1       Milktooth place go want good breakfast cocktai...
2       Busy place, offered extra- special coffee wait...
3       Creative food good. We get coffee tea wait sea...
4       Amazing comfortable atmosphere extremely uniqu...
                              ...                        
1416    Nice freely staff. Was bit disappointed learne...
1417    Arrived 1:30pm. Walked around looking find che...
1418    Verdict: A good brunch spot try new things Atm...
1419    For picky eater, place terrifying. While I'm g...
1420    So I've twice... This place gotten lot buzz la...
Name: text, Length: 1421, dtype: object

#### 2. Tokenize the reviews and find all unique tokens in the data.


In [None]:
tokenized_reviews = reviews_x['text'].apply(
    lambda doc: pd.Series(nltk.word_tokenize(doc)).str.lower().to_numpy()
    )

unique_tokens = np.unique(np.concatenate(tokenized_reviews))

tokenized_reviews, unique_tokens

(0       [locals, recommended, milktooth, ,, and, it, '...
 1       [milktooth, is, the, place, to, go, if, you, w...
 2       [busy, place, ,, but, we, were, offered, extra...
 3       [creative, food, and, good, ., we, get, coffee...
 4       [amazing, and, comfortable, atmosphere, with, ...
                               ...                        
 1416    [nice, and, freely, staff, ., was, a, bit, dis...
 1417    [arrived, at, 1:30pm, ., walked, around, looki...
 1418    [verdict, :, a, good, brunch, spot, to, try, n...
 1419    [for, a, picky, eater, ,, this, place, is, ter...
 1420    [so, i, 've, been, here, twice, ..., this, pla...
 Name: text, Length: 1421, dtype: object,
 array(['!', '#', '$', ..., '~explosion', '\u200b', '\u200d'], dtype=object))

#### 3. Create a matrix of tf-idf vectors for each review.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#instantiate CountVectorizer()
cv=CountVectorizer()

# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(reviews_x['text'])
word_count_vector.shape

(1421, 7849)

In [None]:
# count matrix
count_vector=cv.transform(reviews_x['text'])

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)


# tf-idf scores
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = cv.get_feature_names_out()

In [None]:
tf_idf_vector.T.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
#get tfidf vector for first document
first_document_vector=tf_idf_vector[0:10]

#print the scores
df = pd.DataFrame(first_document_vector.T.todense(),
                  index=feature_names,
                  columns = [f'r{i}' for i in
                             range(first_document_vector.shape[0])])

#df.sort_values(by=["r6"],ascending=False)
#ser = df.iloc[:,0]
#ser
#ser.drop(labels = ['and'])

In [None]:
reviews_x['text']

0       Locals recommended Milktooth, and it's an amaz...
1       Milktooth is the place to go if you want a goo...
2       Busy place, but we were offered extra- special...
3       Creative food and good. We get coffee or tea w...
4       Amazing and comfortable atmosphere with extrem...
                              ...                        
1416    Nice and freely staff. Was a bit disappointed ...
1417    Arrived at 1:30pm.  Walked around looking to f...
1418    Verdict: A good brunch spot to try new things\...
1419    For a picky eater, this place is terrifying. \...
1420    So I've been here twice... This place has gott...
Name: text, Length: 1421, dtype: object

In [None]:

from sklearn.cluster import KMeans
km = KMeans(n_clusters=8, init='random', max_iter=100, n_init=1, verbose=1)
km.fit(df)
labels = km.predict(df)
clusters = {}
n = 0
for item in labels:
  if n < len(df.columns):
    if item in clusters:
      clusters[item].append(df["r" +str(n)])
    else:
      clusters[item] = [df["r" +str(n)]]
    n +=1
  else:
    break

for item in clusters:
  print("Cluster ", item)
  for i in clusters[item]:
    print( i)

Initialization complete
Iteration 0, inertia 10.0.
Iteration 1, inertia 8.13743692420152.
Iteration 2, inertia 7.496419253127968.
Iteration 3, inertia 7.0854092840625515.
Iteration 4, inertia 6.801376691838655.
Iteration 5, inertia 6.56818283167231.
Iteration 6, inertia 6.472166418498988.
Iteration 7, inertia 6.436518586454136.
Iteration 8, inertia 6.386055915031424.
Iteration 9, inertia 6.257312288886837.
Iteration 10, inertia 6.0680267795632865.
Iteration 11, inertia 6.015720401676517.
Iteration 12, inertia 5.994517875040576.
Iteration 13, inertia 5.9676790493030385.
Iteration 14, inertia 5.961516780360867.
Iteration 15, inertia 5.96012396174175.
Converged at iteration 15: strict convergence.
Cluster  0
00          0.0
000         0.0
00a         0.0
00am        0.0
00pm        0.0
           ... 
zone        0.0
zoo         0.0
zooey       0.0
zoë         0.0
zucchini    0.0
Name: r0, Length: 7849, dtype: float64
00          0.0
000         0.0
00a         0.0
00am        0.0
00pm  

In [None]:
labels

NameError: ignored

##DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_normalized = scaler.fit_transform(df)

dbscan = DBSCAN(eps=0.5, min_samples=5)

dbscan.fit(df_normalized)
labels = dbscan.labels_

clusters = {}
n = 0
for item in labels:
    if n < len(df.columns):
        if item != -1:
            if item in clusters:
                clusters[item].append(df["r" + str(n)])
            else:
                clusters[item] = [df["r" + str(n)]]
        n += 1
    else:
        break

for item in clusters:
    print("Cluster", item)
    for i in clusters[item]:
        print(i)