# Import dependencies and determine working directory

In [1]:
# Import libraries
import ast
import json
import os

import pandas as pd

# Import NLP dictionary
import nltk
import string # for punctuation
import re

# Get lemmatizer
# The WordNet corpus is fetched before the lemmatizer is created; when the
# corpus is already present nltk.download only reports that it is up to date.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Single module-level instance, reused by pre_process_review for every token.
lemmatizer = WordNetLemmatizer()

# import tokenizer
from nltk.tokenize import wordpunct_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chriskhoo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Resolve the folder the notebook is running from.
# NOTE: '__file__' is a plain string literal here, not the module variable
# (which a notebook does not define). abspath() anchors that dummy file name
# at the current working directory, and dirname() strips it off again.
# NOTE: the name `dir` shadows the built-in dir(); it is kept because the
# later cells build their paths from it.
placeholder_path = os.path.abspath('__file__')
dir = os.path.dirname(placeholder_path)

# Load data

In [3]:
def _read_json_lines(path):
    """Parse a file holding one JSON document per line.

    Parameters
    ----------
    path : str
        Location of the file; it is read as UTF-8 text.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf8", mode='r') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
        return [json.loads(line) for line in file if line.strip()]


# Load review data
# os.path.join keeps the path portable across operating systems
filename_review = os.path.join(dir, '01_raw_data','review.json')
reviews = _read_json_lines(filename_review)

# create a pandas data frame from the review data
reviews_df = pd.DataFrame(reviews)

# Load business data
filename_business = os.path.join(dir, '01_raw_data','business.json')
businesses = _read_json_lines(filename_business)

# create a pandas data frame from the business data
businesses_df = pd.DataFrame(businesses)

# Merge dataframes and select US Restaurants

In [4]:
# Bounding box (degrees) used to restrict restaurants to the USA.
# NOTE(review): a rectangle also admits any non-US business whose coordinates
# fall inside it (for example cities just north of the US border) -- confirm
# whether an explicit country/state filter is needed as well.
US_LATITUDE_MIN, US_LATITUDE_MAX = 24.7433195, 49.3457868
US_LONGITUDE_MIN, US_LONGITUDE_MAX = -124.7844079, -66.9513812


def _is_restaurant(categories):
    """Return True when a business' categories mention 'Restaurants'.

    Accepts either a list/tuple of category names or a single string and does
    a substring match, as the previous per-row ``Series.str.contains`` did.
    Anything else (None, NaN, empty containers) counts as "not a restaurant"
    instead of raising, so businesses without categories are simply dropped.
    """
    if isinstance(categories, str):
        return 'Restaurants' in categories
    if isinstance(categories, (list, tuple)):
        return any(isinstance(category, str) and 'Restaurants' in category
                   for category in categories)
    return False


# convert date to a datetime - note stars will be kept as an integer vs category
reviews_df['date'] = pd.to_datetime(reviews_df['date'], format='%Y-%m-%d')

# Extract restaurants (plain-Python predicate: no pandas Series is built per row)
restaurants_df = businesses_df[businesses_df['categories'].apply(_is_restaurant)]

# Restrict restaurants to the USA using a bounding box (both bounds inclusive)
inside_us_box = (
    restaurants_df['latitude'].between(US_LATITUDE_MIN, US_LATITUDE_MAX)
    & restaurants_df['longitude'].between(US_LONGITUDE_MIN, US_LONGITUDE_MAX)
)
restaurants_df = restaurants_df[inside_us_box]

# Merge both data frames; columns present in both get a _review / _business suffix
joint_df = pd.merge(reviews_df, restaurants_df, on='business_id', suffixes=('_review', '_business'))

# Drop all columns other than review text and review stars

In [5]:
# select only stars_review and text columns
joint_df = joint_df.loc[:, ['stars_review', 'text']]

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
joint_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review    int64
text            object
dtypes: int64(1), object(1)
memory usage: 65.8+ MB
None


# Pre-process review text

In [6]:
# Character class matching any single ASCII punctuation mark; the marks are
# escaped first so that characters such as ']' or '\' are taken literally.
escaped_punctuation = re.escape(string.punctuation)
regex = re.compile('[%s]' % escaped_punctuation)

# define pre-processing function
def pre_process_review( review_text ):
    """Turn one review into a list of lemmatized, lower-case tokens.

    Steps, in order: lower-case the text, delete every character matched by
    the module-level ``regex`` (punctuation), tokenize with
    ``wordpunct_tokenize`` and lemmatize each token with the shared
    ``lemmatizer``.

    Note that punctuation is deleted rather than replaced by a space, so
    words separated only by punctuation end up merged into a single token.

    Parameters
    ----------
    review_text : str
        Raw review text.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lowered_text = review_text.lower()
    text_without_punctuation = regex.sub('', lowered_text)
    tokens = wordpunct_tokenize(text_without_punctuation)
    return list(map(lemmatizer.lemmatize, tokens))

# Smoke-test pre_process_review on a sentence mixing punctuation, upper-case
# letters and several inflected word forms.
sample_review = "Hey you! Test this sample review, with this: and this ...; and other punctuations and UPPER case letters! And walk, walking, walks, cats and cacti for lemmatizing."
print( pre_process_review(sample_review) )

['hey', 'you', 'test', 'this', 'sample', 'review', 'with', 'this', 'and', 'this', 'and', 'other', 'punctuation', 'and', 'upper', 'case', 'letter', 'and', 'walk', 'walking', 'walk', 'cat', 'and', 'cactus', 'for', 'lemmatizing']


In [7]:
# Run every review through pre_process_review and keep the token lists in a
# new column (the function is passed directly; no wrapper lambda is needed)
joint_df['processed_review'] = joint_df['text'].apply(pre_process_review)

# Check output: show the first 20 token lists
joint_df['processed_review'].head(20)

0     [this, place, is, horrible, we, were, so, exci...
1     [for, being, fairly, fast, food, pei, wei, pro...
2     [i, decided, to, try, it, out, im, celiac, and...
3     [im, not, saying, pei, wei, is, the, best, asi...
4     [sometimes, the, food, is, spot, on, and, deli...
5     [decent, customer, service, but, the, food, wa...
6     [super, clean, restaurant, and, friendly, staf...
7     [found, this, the, other, night, it, is, the, ...
8     [the, staff, here, is, great, and, theyre, nic...
9     [i, had, the, garlic, ginger, broccoli, chicke...
10    [this, review, is, based, upon, consistency, o...
11    [i, love, this, place, id, recommend, it, to, ...
12    [1st, place, is, not, closed, there, wa, an, i...
13    [definitely, not, a, fan, coming, from, orange...
14    [pretty, good, not, great, definitely, overpri...
15    [i, wish, i, could, give, 15, star, nothing, s...
16    [disappointed, that, on, yelp, their, hour, sh...
17    [1st, visit, had, the, lo, meindelish, 2nd

# Drop unprocessed text column

In [8]:
# The raw text is no longer needed once processed_review exists; remove the
# column from joint_df itself (in place, exactly as `del` does).
joint_df.drop('text', axis=1, inplace=True)

In [9]:
# DataFrame.info() prints its report itself and returns None; calling it
# directly avoids the stray "None" that print() added to the output.
joint_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 65.8+ MB
None


# Save output

In [13]:
# Save df into a csv
output_dir = os.path.join(dir, '02_processed_data')
# to_csv does not create missing folders, so make sure the target exists
# (exist_ok=True keeps re-runs from failing when it is already there)
os.makedirs(output_dir, exist_ok=True)
filename_out = os.path.join(output_dir, 'review_text_stars.csv')
# NOTE: processed_review holds Python lists; CSV stores each one as its text
# representation, so anything reading this file has to parse it back to a list.
joint_df.to_csv(filename_out, index=False)

In [14]:
# Read the saved CSV back into a second frame so it can be compared with the
# one held in memory.
# NOTE: no converter is applied, so every processed_review cell comes back as
# one string (the text form of the list), not as a list of tokens.
file_path = os.path.join(dir, '02_processed_data', 'review_text_stars.csv')
joint_df2 = pd.read_csv(filepath_or_buffer=file_path, index_col=False)

In [15]:
# run checks to see the data frames are similar
print( 'Saved df shape:', str(joint_df.shape) )
print( 'Loaded df shape:', str(joint_df2.shape) )
print( '-------------')
print( 'Saved df info:')
# info() prints its own report and returns None; wrapping it in print() only
# added a stray "None" line after each report.
joint_df.info()
print( '-------------')
print( 'Loaded df info:')
joint_df2.info()

# Shape and dtype summaries cannot reveal changed values, so compare content too.
assert joint_df.shape == joint_df2.shape, 'row/column count changed in the CSV round trip'
assert (joint_df['stars_review'].values == joint_df2['stars_review'].values).all(), \
    'star ratings changed in the CSV round trip'


def _as_token_list(value):
    """Parse the text form of a token list back into a list; pass lists through."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# Token lists are written to CSV as text. Parsing all rows would be slow, so
# only a small sample is parsed back and compared with the in-memory lists.
ROUND_TRIP_SAMPLE_SIZE = 5
saved_sample = joint_df['processed_review'].head(ROUND_TRIP_SAMPLE_SIZE).tolist()
loaded_sample = [_as_token_list(value)
                 for value in joint_df2['processed_review'].head(ROUND_TRIP_SAMPLE_SIZE)]
assert loaded_sample == saved_sample, 'token lists changed in the CSV round trip'

Saved df shape: (2876509, 2)
Loaded df shape: (2876509, 2)
-------------
Saved df info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 65.8+ MB
None
-------------
Loaded df info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB
None
