In [9]:
import pandas as pd
import numpy as np

from collections import OrderedDict
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

# English stop-word list from NLTK; remove_stopwords() tests tokens against it.
stop = stopwords.words('english')

In [2]:
def load_data(path="../data/df-raw.csv"):
    """Load the raw review table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``../data/df-raw.csv`` so that
        existing ``load_data()`` calls keep reading the same file.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``.
    """
    # low_memory=False: infer each column's dtype from the whole file instead
    # of chunk by chunk, which avoids mixed-type columns.
    return pd.read_csv(path, low_memory=False)

In [3]:
def remove_stopwords(words, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Parameters
    ----------
    words : str
        Text to filter. Tokens are obtained with ``str.split()`` and compared
        to the stop words exactly (case-sensitive, punctuation included).
        Any non-string value, such as the NaN pandas stores for an empty
        cell, is treated as empty text.
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``''`` when the
        input is not a string or nothing survives.
    """
    # Explicit guard instead of a blanket try/except: only missing or
    # non-text input is mapped to '', real errors are no longer hidden.
    if not isinstance(words, str):
        return ''
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); the stop-word list would be scanned per token.
    blocked = set(stop_words)
    return ' '.join(token for token in words.split() if token not in blocked)

In [4]:
def data_cleaning(df):
    """Normalise the ``date`` and ``reviewContent`` columns of the review table.

    The frame is modified in place and also returned.

    Steps
    -----
    1. ``date``: a single leading ``'\\n'`` is removed from string values;
       missing or non-string values are left untouched.
    2. ``reviewContent``: each review is lowercased, reduced to its ``\\w+``
       tokens (which drops punctuation) and then filtered with
       ``remove_stopwords``. Stop words are removed last so that capitalised
       ones ("The") and ones followed by punctuation ("the,") are matched.
       Missing or non-string reviews become ``''``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date`` and ``reviewContent``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    print("Cleaning Data")

    # Removing \n from date field.
    # Whole-column assignment instead of df['date'][i] = ...: chained
    # assignment may write to a temporary copy, and indexing [0] fails on
    # missing or empty values.
    df['date'] = df['date'].map(
        lambda value: value[1:]
        if isinstance(value, str) and value.startswith('\n')
        else value)

    # TODO: handling of empty cells is not implemented yet; missing review
    # text is mapped to '' below, other columns are left as they are.

    # Pre-processing text reviews
    tokenizer = RegexpTokenizer(r'\w+')

    def _normalise(text):
        """Lowercase, strip punctuation, then remove stop words."""
        if not isinstance(text, str):
            return ''
        without_punctuation = ' '.join(tokenizer.tokenize(text.lower()))
        return remove_stopwords(without_punctuation)

    df['reviewContent'] = df['reviewContent'].apply(_normalise)

    print("Data Cleaning Complete")
    return df

In [5]:
def _max_content_similarity(reviews, vectorizer):
    """Return the largest pairwise TF-IDF cosine similarity among ``reviews``.

    Returns ``0.0`` when there are fewer than two string reviews, or when no
    vocabulary can be built from them (for example, every review is blank).
    """
    docs = [text for text in reviews if isinstance(text, str)]
    if len(docs) < 2:
        # A single review has nothing to be compared with.
        return 0.0
    try:
        tfidf = vectorizer.fit_transform(docs)
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary. Returning here
        # avoids reusing a matrix fitted on a different reviewer.
        return 0.0
    similarity = 1 - pairwise_distances(tfidf, metric='cosine')
    # Exclude every review's similarity with itself.
    np.fill_diagonal(similarity, -np.inf)
    return float(similarity.max())


def feature_engineering(df):
    """Add behavioural and textual features to the cleaned review table.

    New columns
    -----------
    mnr
        Number of reviews a reviewer wrote on a given date, divided by the
        largest such count in the data.
    reviewLength
        Number of whitespace-separated tokens in ``reviewContent``.
    reviewDeviation
        ``|rating - restaurantRating| / 4``.
    Maximum Content Similarity
        Highest TF-IDF cosine similarity between any two reviews written by
        the same reviewer; ``0`` when it cannot be computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``reviewerID``, ``date``, ``reviewContent``, ``rating``
        and ``restaurantRating``. ``reviewContent`` must hold strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns. Rows containing any null value
        are dropped; the input frame is not modified.
    """
    print("Feature Engineering: Creating New Features")

    # Maximum Number of Reviews per day per reviewer
    mnr = (df[['reviewerID', 'date']]
           .groupby(by=['date', 'reviewerID'])
           .size()
           .reset_index(name='mnr'))
    mnr['mnr'] = mnr['mnr'] / mnr['mnr'].max()
    df = df.merge(mnr, on=['reviewerID', 'date'], how='inner')

    # Review Length
    df['reviewLength'] = df['reviewContent'].apply(lambda x: len(x.split()))

    # Review Deviation
    # NOTE(review): 4 is presumably the span of a 1-5 star scale - confirm.
    df['reviewDeviation'] = (df['rating'] - df['restaurantRating']).abs() / 4

    # Maximum cosine similarity: collect each reviewer's texts first.
    reviews_by_reviewer = {}
    for reviewer_id, text in zip(df['reviewerID'], df['reviewContent']):
        reviews_by_reviewer.setdefault(reviewer_id, []).append(text)

    # min_df=1 keeps every term that occurs at all; the integer 0 is not
    # accepted by scikit-learn's parameter validation in recent releases.
    vectorizer = TfidfVectorizer(min_df=1)
    similarity_rows = [
        {'reviewerID': reviewer_id,
         'Maximum Content Similarity': _max_content_similarity(texts, vectorizer)}
        for reviewer_id, texts in reviews_by_reviewer.items()
    ]
    similarity_df = pd.DataFrame(
        similarity_rows, columns=['reviewerID', 'Maximum Content Similarity'])

    # left outer join on original datamatrix and cosine dataframe
    df = pd.merge(df, similarity_df, on="reviewerID", how="left")

    # Drop every row that still contains a null in any column.
    df = df.dropna()
    print("Feature Engineering Complete")
    return df

In [12]:
# Read the raw review table via load_data().
df = load_data()

In [13]:
# Clean the date and review-text columns. data_cleaning assigns into `df` and
# returns that same object, so `df_cleaned` and `df` refer to one DataFrame.
df_cleaned = data_cleaning(df)

Cleaning Data
Data Cleaning Complete


In [10]:
# Add the engineered columns (mnr, reviewLength, reviewDeviation,
# Maximum Content Similarity); rows containing nulls are dropped.
df_features = feature_engineering(df_cleaned)

Feature Engineering: Creating New Features




Feature Engineering Complete


In [11]:
# Write the feature table to disk without the index column.
df_features.to_csv("../data/df-final.csv", index=False)

In [14]:
# Scratch: independent copy of the two key columns, mirroring the first step
# of feature_engineering so it can be inspected.
mnr_df1 = df_cleaned[['reviewerID', 'date']].copy()

In [21]:
mnr_df1.head()

Unnamed: 0,reviewerID,date
0,bNYesZ944s6IJVowOnB0iA,9/22/2012
1,TRKxLC3y-ZvP45e5iilMtw,9/22/2012
2,0EMm8umAqXZzyhxNpL4M9g,9/19/2012
3,DlwexC7z88ymAzu45skODw,9/6/2012
4,kW2dk1CWihmh3g7k9N2G8A,9/9/2012


In [17]:
# Scratch: count reviews per (date, reviewerID) pair; the count is stored in
# a new 'mnr' column.
mnr_df2 = mnr_df1.groupby(by=['date', 'reviewerID']).size().reset_index(name='mnr')

In [22]:
mnr_df2.head()

Unnamed: 0,date,reviewerID,mnr
0,1/1/2007,cWLSaEabp-BJq6XmovsZIQ,1
1,1/1/2011,yi5W66M-PcMXO_Ft38cE6w,3
2,1/1/2012,WmGor7V3XoOO0nceQNdjKA,1
3,1/10/2008,ZIowdUve-IjIN-EgQIwCLA,1
4,1/10/2012,8D8mjeNaJ3DGhl0WX8copA,2


In [24]:
# Scratch: mapping that the next cell fills with reviewerID -> list of review texts.
res = OrderedDict()

In [25]:
# Group review texts by reviewer, keeping reviewers in first-seen order.
# The mapping is rebuilt from scratch here so that re-running this cell does
# not append every review a second time.
res = OrderedDict()
for reviewer_id, text in zip(df_cleaned['reviewerID'], df_cleaned['reviewContent']):
    res.setdefault(reviewer_id, []).append(text)

In [27]:
# One record per reviewer: the ID plus the list of that reviewer's texts.
individual_reviewer = [
    dict(reviewerID=reviewer_id, reviewContent=texts)
    for reviewer_id, texts in res.items()
]

In [41]:
# min_df=1 keeps every term that occurs in at least one document, which is
# what min_df=0 amounted to; the integer 0 is rejected by scikit-learn's
# parameter validation in recent releases.
vector = TfidfVectorizer(min_df=1)

In [42]:
tet = ['unlike next eaten previous night dish completely recognizable notable best possible incarnation alinea delivers meal created willy wonka rules thrown window recount every course highlight dishes made laugh let know restaurant like place rules followed sanctuary allowed chef freedom go wherever imagination led example beverage choices explained six bowls containing blocks ice resting beds pebbles placed table without word said block hole drilled center made decisions drinking servers came table glass straws filled steelhead roe liquid stoppered either end foam instructed take straws fit perfectly sized hole filled ice melt inhale mixture single slurp quickly loudly could frigid rush roe herbs peach unsure whether fruit flavor came straw melt sweet salty made us giggle perfect start unusual meal life lamb 86 presented narrow plate containing three perfect pieces lamb prepared different manner two plexiglas shields placed center table 86 different garnishes none identified promised fitting accompaniments lamb cut three tender servings three smaller pieces could experience flavor combinations table turned guessing game plucked sides tempted us tried identify chosen every course unusual presentation courses astonished perfection alone hot potato cold potato told one oldest courses still remained alinea menu consisted paraffin bowl filled cold potato soup hot potato piece truffle held aloft skewer went lip bowl pull pin moment two ingredients met toss back soup time warm potato time chill two flavors temperatures came together tongue black truffle explosion served spoon chewed closed lips none delectable juice escaped awed science fictional nature presentations precise combination ingredients eaten exactly right moment one course makes clear dish must eaten served green apple helium balloon ingested upon delivery would fear end collapsing upon eaten skin balloon well string held made apply taffy told press lips balloon take bite inhale mouth filled immediately 
intense flavor apple helium well course soon talking amazing tasted sounding like mickey minnie mouse final course table cleared asked lift glasses silicone covering rolled servers brought many small bowls two metal jugs sweating condensation three white globes circular holes tops two chefs arrived began spooning contents bowls onto table scattering like culinary jackson pollacks identified freeze dried strawberry english pea buttermilk cream chefs lifted jug announced liquid nitrogen poured contents white globes caused fog rise cover table finally chefs lifted globes made white chocolate smashed upon table revealing goodies within meringues cotton candy miniature jelly donuts condensed honeydew melon took spoons attacked making much dent creation deserved meal end stepped restaurant four hours thirty nine minutes arrived yes meal took four hours thirty nine minutes assure feel much time passed oh going go cliche mode know sometimes said time stood still well time stand still per se let say alinea exist outside time pocket universe', 'purple pig one restaurants wanted try chicago extremely popular restaurant takes reservations called ask long wait would one night dinner told 2 1 2 hours another friend called independently told three hours noticed kitchen open midnight sundays decided wrangle bunch friends go sunday night meeting rather us arriving around 11 15 still wait get seated 20 minutes worth think ordered around dozen dishes five us including turned favorite jlt pork jowl tomato frisee fried duck egg pork fat runny egg yolk mingled perfectly making quite sensual visceral food experience also pig tails braised balsamic grated egg parsley roasted bone marrow herbs pig ear crispy kale pickled cherry peppers fried egg every dish perfectly prepared one friends vegetarian made sure order three dishes could including salt roasted beets whipped goat cheese pistachio vinaigrette good even rest us non vegetarians enjoyed finish think vegetarians eat well purple pig 
well unless watching others gorge pig puts even though stuffed order panini filled nutella marshmallow cream bananas perfect gooey way end meal torn rating 4 5 food definitely deserves 5 due wait issues think 4 1 2 definitely back assuming spare time long wait peak times go odd hours']

In [43]:
# Scratch: fit the TF-IDF vocabulary on the two sample reviews in `tet` and
# vectorise them.
tfidf = vector.fit_transform(tet)

In [44]:
# Cosine similarity = 1 - cosine distance; one row and one column per text.
cosine = 1 - pairwise_distances(tfidf, metric='cosine')

In [37]:
# Mask each text's similarity with itself (modifies `cosine` in place).
# NOTE(review): this cell's execution count (37) is lower than that of the
# cell that computes `cosine` (44), so the recorded cosine.flatten() output
# still shows 1.0 on the diagonal. Re-run the notebook top to bottom.
np.fill_diagonal(cosine, -np.inf)

In [46]:
cosine.flatten()

array([1.        , 0.10406824, 0.10406824, 1.        ])

In [40]:
individual_reviewer[0]

{'reviewerID': 'bNYesZ944s6IJVowOnB0iA',
 'reviewContent': ['unlike next eaten previous night dish completely recognizable notable best possible incarnation alinea delivers meal created willy wonka rules thrown window recount every course highlight dishes made laugh let know restaurant like place rules followed sanctuary allowed chef freedom go wherever imagination led example beverage choices explained six bowls containing blocks ice resting beds pebbles placed table without word said block hole drilled center made decisions drinking servers came table glass straws filled steelhead roe liquid stoppered either end foam instructed take straws fit perfectly sized hole filled ice melt inhale mixture single slurp quickly loudly could frigid rush roe herbs peach unsure whether fruit flavor came straw melt sweet salty made us giggle perfect start unusual meal life lamb 86 presented narrow plate containing three perfect pieces lamb prepared different manner two plexiglas shields placed center