In [1]:
from __future__ import division
from dateutil.parser import parse
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
# Load the review table and key it by review_id.
# parse_dates lets pandas convert the `date` column itself instead of
# calling dateutil's parser once per row through a Python-level converter.
review_df = pd.read_csv('../data/yelp_training_set_review.csv',
                        parse_dates=['date']).set_index('review_id')

In [3]:
# Peek at three random reviews; the fixed random_state keeps the displayed
# rows the same every time the notebook is re-run.
review_df.sample(n=3, random_state=42)

Unnamed: 0_level_0,business_id,date,stars,text,type,user_id,votes_cool,votes_funny,votes_useful
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
44F8TSvTYOPxy_HdbWfT0Q,MJ6_uOT55dq5ALphFqMVkw,2010-07-10,5,I've been a long-time lover of Lenny's for sev...,review,AtnYGQjNY2cCSNWt_LY8nQ,0,0,1
iaBPXWJ7Zlcfq26AB8LpdQ,eGevCRobYnA_HSj60sEWvQ,2011-06-15,5,Pita Jungle is my gold standard for Mediterran...,review,QYXQ_M4-U6ptDMHtcRi9kQ,0,0,0
cBwDd4vVMt6d_cf3dqxHcQ,r3r_bAfa6pZKIhQB82FizQ,2009-04-07,1,"Overhyped!!\n\nYeah, the Chef and manager are...",review,xs_fnNe0hofd7ZOupCjesQ,7,7,15


In [4]:
# Load the user table and key it by user_id.
user_df = pd.read_csv('../data/yelp_training_set_user.csv').set_index('user_id')
# Fixed random_state so the previewed rows are reproducible across re-runs.
user_df.sample(n=3, random_state=42)

Unnamed: 0_level_0,average_stars,name,review_count,type,votes_cool,votes_funny,votes_useful
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mB7533hZM30GIULkWFBfhQ,3.71,Chris,14,user,4,4,16
iFTGCeal_5GbRxZvgX7hHg,4.0,Ben,7,user,1,0,4
B7VtyrGlLC43FcGQUmNgwQ,3.6,Ray,30,user,11,7,22


In [5]:
# Load the business table and key it by business_id.
business_df = pd.read_csv('../data/yelp_training_set_business.csv').set_index('business_id')
# Fixed random_state so the previewed rows are reproducible across re-runs.
business_df.sample(n=2, random_state=42)

Unnamed: 0_level_0,categories,city,full_address,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
j8NI4Dd9ANuyrMZ8vTN1eA,"Hotels & Travel,Taxis,Transportation",Scottsdale,"Scottsdale, AZ 85251",33.479298,-111.95948,Big Orange Bus,,True,4,4.0,AZ,business
eSEguR3D17fYgVHU3Y6cxQ,"Reflexology,Health & Medical,Massage,Beauty & ...",Scottsdale,"2765 N Scottsdale Rd\nSte 104\nScottsdale, AZ ...",33.478794,-111.925909,Just Relax,,True,22,4.5,AZ,business


In [6]:
# The three businesses with the highest review_count, largest first
(business_df
 .sort_values(by='review_count', ascending=False)
 .head(3))

Unnamed: 0_level_0,categories,city,full_address,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
hW0Ne_HTHEAgGF1rAdmR-g,"Hotels & Travel,Airports",Phoenix,"3400 E Sky Harbor Blvd\nPhoenix, AZ 85034",33.43475,-112.00644,Phoenix Sky Harbor International Airport,,True,862,3.0,AZ,business
VVeogjZya58oiTxK7qUjAQ,"Pizza,Restaurants",Phoenix,"623 E Adams St\nPhoenix, AZ 85004",33.449233,-112.065458,Pizzeria Bianco,,True,803,4.0,AZ,business
JokKtdXU7zXHcr20Lrk29A,"Bars,Food,Breweries,Pubs,Nightlife,American (N...",Tempe,"1340 E 8th St\nSte 104\nTempe, AZ 85281",33.419451,-111.915926,Four Peaks Brewing Co,,True,735,4.5,AZ,business


In [7]:
# Derive extra per-review features.
# Missing review bodies become empty strings so every row has a length.
review_df['text'] = review_df['text'].fillna('')
# Character count of the raw review text
review_df['review_length'] = review_df['text'].str.len()
review_df.head(2)
# Next: look at the relationship between review length and # of useful votes

Unnamed: 0_level_0,business_id,date,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,review_length
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,889
IjZ33sJrzXqU-0X6U8NwyA,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,1345


In [8]:
# Text-processing tools: stopword list, tokenizer, and stemmer

# English stopwords, kept in a set for fast membership tests
stopset = set(stopwords.words('english'))

# Splits a document up into words
tokenizer = WordPunctTokenizer()

# Takes each token to its root form (e.g., goes ==> go)
stemmer = PorterStemmer()

In [20]:
def tokenize(text):
    """Lower-case ``text`` and split it with the module-level ``tokenizer``.

    Returns whatever ``tokenizer.tokenize`` returns for the lower-cased text.
    """
    lowered = text.lower()
    return tokenizer.tokenize(lowered)

def remove_words(arr, min_word_length=3, remove_stopset=True):
    """Filter a sequence of tokens by length and, optionally, by stopword list.

    Parameters
    ----------
    arr : iterable of str
        Tokens to filter.
    min_word_length : int
        Tokens shorter than this many characters are dropped.
    remove_stopset : bool
        When true, tokens found in the module-level ``stopset`` are dropped too.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    kept = []
    for token in arr:
        # Too short: discard regardless of the stopword setting.
        if len(token) < min_word_length:
            continue
        # Stopword filtering only applies when requested.
        if remove_stopset and token in stopset:
            continue
        kept.append(token)
    return kept

def stem_words(arr):
    """Return a list holding ``stemmer.stem(word)`` for each word in ``arr``, in order."""
    stems = []
    for word in arr:
        stems.append(stemmer.stem(word))
    return stems

def rejoin_text(arr):
    """Join the tokens in ``arr`` into a single space-separated string.

    Parameters
    ----------
    arr : iterable of str
        Tokens to join.

    Returns
    -------
    str
        The tokens separated by single spaces; an empty string when ``arr``
        is empty.
    """
    # str.join accepts any iterable, so no intermediate list copy is needed.
    return " ".join(arr)

# Pipe: text -> tokenize -> remove_words -> stem_words

In [23]:
# Tokenize each review, drop short tokens and stopwords, then glue the
# remaining tokens back into one space-separated string.
review_df['text_no_stopwords'] = review_df['text'].apply(
    lambda raw_text: rejoin_text(remove_words(tokenize(raw_text)))
)
review_df.iloc[:5]

Unnamed: 0_level_0,business_id,date,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,review_length,text_no_stopwords
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,889,wife took birthday breakfast excellent weather...
IjZ33sJrzXqU-0X6U8NwyA,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,1345,idea people give bad reviews place goes show p...
IESLBzqUCLdSzSqm0eCSxQ,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,0,1,76,love gyro plate rice good also dig candy selec...
G-WvGaISbqqaMHlNnByodA,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,0,2,419,rosie dakota love chaparral dog park !!! conve...
1uJFq2r5QfJG_6ExMRCaGw,6ozycU1RpktNG2-1BroVtw,2012-01-05,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,469,general manager scott petello good egg !!! det...


In [24]:
# Character count of the filtered text (the joining spaces are included)
review_df['review_len_no_stopwords'] = review_df['text_no_stopwords'].str.len()
review_df.iloc[:3]

Unnamed: 0_level_0,business_id,date,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,review_length,text_no_stopwords,review_len_no_stopwords
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,889,wife took birthday breakfast excellent weather...,542
IjZ33sJrzXqU-0X6U8NwyA,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,1345,idea people give bad reviews place goes show p...,707
IESLBzqUCLdSzSqm0eCSxQ,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,0,1,76,love gyro plate rice good also dig candy selec...,50


In [28]:
# Number of distinct tokens in each review's filtered text.
# A set de-duplicates the tokens directly; np.unique would build and sort
# a NumPy array per row just to be measured with len().
review_df['review_len_uniq'] = review_df['text_no_stopwords'].apply(
    lambda filtered_text: len(set(tokenize(filtered_text)))
)
review_df.head(1)

Unnamed: 0_level_0,business_id,date,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,review_length,text_no_stopwords,review_len_no_stopwords,review_len_uniq
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,0,5,889,wife took birthday breakfast excellent weather...,542,67


In [29]:
# Filtered text of the first review only
review_df['text_no_stopwords'].iloc[:1]

review_id
fWKvX83p0-ka4JS3dc6E5A    wife took birthday breakfast excellent weather...
Name: text_no_stopwords, dtype: object