# Exploring the Yelp Data Set

In [1]:
import pandas as pd
import StringIO

In [2]:
pd.__version__

u'0.23.3'

# Businesses JSON file

Contains 174,000 businesses with various location data.

In [3]:
businesses = pd.read_json('./data/dataset/business.json', lines=True, dtype={})
businesses.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,"4855 E Warner Rd, Ste B9","{u'AcceptsInsurance': True, u'ByAppointmentOnl...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{u'Tuesday': u'7:30-17:00', u'Friday': u'7:30-...",1,33.33069,-111.978599,Dental by Design,,85044,22,4.0,AZ
1,3101 Washington Rd,"{u'BusinessParking': {u'garage': False, u'stre...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{u'Monday': u'9:00-20:00', u'Tuesday': u'9:00-...",1,40.291685,-80.1049,Stephen Szabo Salon,,15317,11,3.0,PA
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,,85017,18,1.5,AZ
3,"5000 Arizona Mills Cr, Ste 435","{u'BusinessAcceptsCreditCards': True, u'Restau...",8DShNS-LuFqpEWIp0HxijA,"[Sporting Goods, Shopping]",Tempe,"{u'Monday': u'10:00-21:00', u'Tuesday': u'10:0...",0,33.383147,-111.964725,Sports Authority,,85282,9,3.0,AZ
4,581 Howe Ave,"{u'Alcohol': u'full_bar', u'HasTV': True, u'No...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{u'Monday': u'11:00-1:00', u'Tuesday': u'11:00...",1,41.119535,-81.47569,Brick House Tavern + Tap,,44221,116,3.5,OH


In [4]:
businesses.shape

(174567, 15)

Business data set is 15MB:

In [5]:
businesses.memory_usage()

Index                72
address         1396536
attributes      1396536
business_id     1396536
categories      1396536
city            1396536
hours           1396536
is_open         1396536
latitude        1396536
longitude       1396536
name            1396536
neighborhood    1396536
postal_code     1396536
review_count    1396536
stars           1396536
state           1396536
dtype: int64

# Reviews JSON file

In [41]:
reader = pd.read_json('./data/dataset/review.json', lines=True, chunksize=1000)
num_chunks = 0
last_chunk = None
for chunk in reader:
    num_chunks += 1
    if num_chunks >= 1:
        last_chunk = chunk
        break
last_chunk.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,0W4lkclzZThpx3V65bVgig,0,2016-05-28,0,v0i_UHJMo_hPBq9bxWvW4w,5,"Love the staff, love the meat, love the place....",0,bv2nCi5Qv5vroFiqKGopiw
1,AEx2SYEUJmTxVVB18LlCwA,0,2016-05-28,0,vkVSCC7xljjrAI4UGfnKEQ,5,Super simple place but amazing nonetheless. It...,0,bv2nCi5Qv5vroFiqKGopiw
2,VR6GpWIda3SfvPC-lg9H3w,0,2016-05-28,0,n6QzIUObkYshz4dz2QRJTw,5,Small unassuming place that changes their menu...,0,bv2nCi5Qv5vroFiqKGopiw
3,CKC0-MOWMqoeWf6s-szl8g,0,2016-05-28,0,MV3CcKScW05u5LVfF6ok0g,5,Lester's is located in a beautiful neighborhoo...,0,bv2nCi5Qv5vroFiqKGopiw
4,ACFtxLv8pGrrxMm6EgjreA,0,2016-05-28,0,IXvOzsEMYtiJI0CARmj77Q,4,Love coming here. Yes the place always needs t...,0,bv2nCi5Qv5vroFiqKGopiw


Review data set should be about 357 MB:

In [42]:
# Chunk size # num chunks
(8000 * 9) * 5200 / 1024 / 1024

357

# Join text reviews with location of businesses

In [49]:
# Define batch size, all 5.2M reviews won't fit in memory
batch_size = 100000

# Subset columns to location data, ids, and name only
businesses = businesses[['business_id', 'city', 'latitude', 'longitude', 'name', 'neighborhood', 'postal_code', 'state']]

# Iterate through batches
reader = pd.read_json('./data/dataset/review.json', lines=True, chunksize=batch_size)
last_chunk = None
num_chunks = 0
for chunk in reader:
    
    # Subset review columns and join to location data from business file
    chunk = chunk[['business_id', 'review_id', 'text', 'user_id']]
    chunk = chunk.merge(businesses, on='business_id', how='left')
    
    num_chunks += 1
    if num_chunks >= 1:
        last_chunk = chunk
        break
        
last_chunk.head()

Unnamed: 0,business_id,review_id,text,user_id,city,latitude,longitude,name,neighborhood,postal_code,state
0,0W4lkclzZThpx3V65bVgig,v0i_UHJMo_hPBq9bxWvW4w,"Love the staff, love the meat, love the place....",bv2nCi5Qv5vroFiqKGopiw,Montréal,45.516373,-73.577537,Schwartz's,Plateau-Mont-Royal,H2W 1X9,QC
1,AEx2SYEUJmTxVVB18LlCwA,vkVSCC7xljjrAI4UGfnKEQ,Super simple place but amazing nonetheless. It...,bv2nCi5Qv5vroFiqKGopiw,Montréal,45.523333,-73.594859,Wilensky's,Plateau-Mont-Royal,H2T 2M1,QC
2,VR6GpWIda3SfvPC-lg9H3w,n6QzIUObkYshz4dz2QRJTw,Small unassuming place that changes their menu...,bv2nCi5Qv5vroFiqKGopiw,Montréal,45.472902,-73.588321,Tuck Shop,Sud-Ouest,H4C 1S7,QC
3,CKC0-MOWMqoeWf6s-szl8g,MV3CcKScW05u5LVfF6ok0g,Lester's is located in a beautiful neighborhoo...,bv2nCi5Qv5vroFiqKGopiw,Outremont,45.522144,-73.607076,Lester's Deli,Outremont,H2V 1V1,QC
4,ACFtxLv8pGrrxMm6EgjreA,IXvOzsEMYtiJI0CARmj77Q,Love coming here. Yes the place always needs t...,bv2nCi5Qv5vroFiqKGopiw,Montréal,45.50251,-73.570119,Five Guys,Ville-Marie,H3B 1B9,QC


# Constructing dialect labels

There are a few ways we might construct labels for our dialect classification task:

1. look for a different dataset that includes American English dialects
2. perform clustering to discover regional clusters of similar vocabulary/syntax.
3. choose arbitrary regions as dialects (west coast, northeast, south), and manually apply the dialect to each review
4. use the country, state, or city of the review to represent the dialect.

For this initial baseline classification task, we'll first explore option #4, and then #3 if necessary.

In [76]:
# first, explore the distribution of reviews by state in the data set
num_chunks = 0
counts = pd.Series()
reader = pd.read_json('./data/dataset/review.json', lines=True, chunksize=batch_size)
for chunk in reader:
    
    # Subset review columns and join to location data from business file
    chunk = chunk[['business_id', 'review_id', 'text', 'user_id']]
    chunk = chunk.merge(businesses, on='business_id', how='left')
    
    counts = counts.add(chunk.groupby(['state'])['review_id'].count(), fill_value=0)
    
#     num_chunks += 1
#     if num_chunks >= 10:
#         last_chunk = chunk
#         break
        
counts = counts.sort_values(ascending=False)
counts[counts > 1000]

state
NV     1824442.0
AZ     1627792.0
ON      634366.0
NC      307665.0
OH      243768.0
PA      229850.0
QC      146372.0
WI      109751.0
EDH      47889.0
IL       36467.0
BW       35400.0
SC       10860.0
MLN       1851.0
HLD       1288.0
dtype: float64

This gives us 14 states which we will can use as a proxy for dialects (eliminating any states with less than 1000 reviews). These states are not evenly geographically distributed, and are probably not a great representation of dialect. In future work, we'll explore different representations of dialect mentioned above

# Exploring & cleaning text reviews

In [56]:
pd.options.display.max_colwidth = 1000
print last_chunk.loc[1:10, 'text']

1                                                                                                                                                                                                                                                                                                                                                                                                             Super simple place but amazing nonetheless. It's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. \n\nStaff was very helpful and friendly.
2                                                                                                          Small unassuming place that changes their menu every so often. Cool decor and vibe inside their 30 seat restaurant. Call for a reservation. \n\nWe had their beef tartar and pork belly to start and a salmon dish and lamb meal for mains. Everything was incredible! I could go on at

# Training a bag of words classifier

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# training on just one chunk initially
vect = CountVectorizer(stop_words='english', lowercase=True)
counts = vect.fit_transform(last_chunk['text'])
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(counts)

# train a model predicting the state from the text
labels_train, labels_test, features_train, features_test = train_test_split(last_chunk['state'], tfidf, test_size=0.20, random_state=42)
trained_model = LogisticRegression().fit(features_train, labels_train)

In [82]:
predictions = trained_model.predict(features_test)
accuracy = metrics.accuracy_score(labels_test, predictions)
print accuracy
# print metrics.confusion_matrix(predictions, labels_test)

0.56505
