In [2]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# read the entire file into a python array
with open('../yelp_academic_dataset_business.json') as f:
    data = f.readlines()

# remove the trailing "\n" from each line
data = list(map(lambda x: x.rstrip(), data))

# each element of 'data' is an individual JSON object.
# convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object

data_json_str = "[" + ','.join(data) + "]"

# now, load it into pandas
yelp_businessdata_df = pd.read_json(data_json_str)

In [3]:
print(data[1])

{"business_id": "UsFtqoBl7naz8AVUBZMjQQ", "full_address": "202 McClure St\nDravosburg, PA 15034", "hours": {}, "open": true, "categories": ["Nightlife"], "city": "Dravosburg", "review_count": 4, "name": "Clancy's Pub", "neighborhoods": [], "longitude": -79.8868138, "state": "PA", "stars": 3.5, "latitude": 40.3505527, "attributes": {"Happy Hour": true, "Accepts Credit Cards": true, "Good For Groups": true, "Outdoor Seating": false, "Price Range": 1}, "type": "business"}


In [4]:
#Filtering yelp business data for restaurants
#and selecting restaurants only within Phoenix (AZ), Las Vegas (NV), Charlotte (NC), Pittsburg(PA)

#define US States two letter code
us_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
            'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
            'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM',
            'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
            'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

us_cities = ['Phoenix', 'Las Vegas', 'Charlotte', 'Pittsburgh']

def generate_cityrestaurantdf(cityname):
    index = 0
    attributes = []
    business_id = []
    categories = []
    city = []
    full_address = []
    name = []
    review_count = []
    stars = []
    state = []
    latitude = []
    longitude = []

    #iterate through each row and pick ones that have 'Restaurant' in their category listing
    yelp_businesses_categories = yelp_businessdata_df['categories']
    yelp_businesses_states = yelp_businessdata_df['state']
    yelp_businesses_cities = yelp_businessdata_df['city']
    for category in yelp_businesses_categories:
        usstate = yelp_businesses_states[index]
        uscity = yelp_businesses_cities[index]
        for type in category:
            if "Restaurant" in type and usstate in us_states and (uscity == cityname):
                attributes.append(yelp_businessdata_df['attributes'][index])
                business_id.append(yelp_businessdata_df['business_id'][index])
                categories.append(yelp_businessdata_df['categories'][index])
                city.append(yelp_businessdata_df['city'][index])
                latitude.append(yelp_businessdata_df['latitude'][index])
                longitude.append(yelp_businessdata_df['longitude'][index])
                full_address.append(yelp_businessdata_df['full_address'][index])
                name.append(yelp_businessdata_df['name'][index])
                review_count.append(yelp_businessdata_df['review_count'][index])
                stars.append(yelp_businessdata_df['stars'][index])
                state.append(yelp_businessdata_df['state'][index])
                #print(category)
        index = index + 1

    yelp_restaurant_businesses_df = pd.DataFrame({'attributes': attributes,
                                                  'business_id': business_id,
                                                  'categories': categories,
                                                  'city': city,
                                                  'full_address': full_address,
                                                  'latitude': latitude,
                                                  'longitude': longitude,
                                                  'name': name,
                                                  'review_count': review_count,
                                                  'stars': stars,
                                                  'state': state})

    sorted_yelp_rb_df = yelp_restaurant_businesses_df.sort_values('business_id')
    
    return sorted_yelp_rb_df


phoenix_restaurantdf = generate_cityrestaurantdf('Phoenix')
lasvegas_restaurantdf = generate_cityrestaurantdf('Las Vegas')
charlotte_restaurantdf = generate_cityrestaurantdf('Charlotte')
pittsburg_restaurantdf = generate_cityrestaurantdf('Pittsburgh')

print('Total #restaurant businesses in Phoenix:', len(phoenix_restaurantdf))
print('Total #restaurant businesses in Las Vegas:', len(lasvegas_restaurantdf))
print('Total #restaurant businesses in Charlotte:', len(charlotte_restaurantdf))
print('Total #restaurant businesses in Pittsburgh:', len(pittsburg_restaurantdf))

phoenix_restaurantdf.head()

Total #restaurant businesses in Phoenix: 2922
Total #restaurant businesses in Las Vegas: 4658
Total #restaurant businesses in Charlotte: 1886
Total #restaurant businesses in Pittsburgh: 1401


Unnamed: 0,attributes,business_id,categories,city,full_address,latitude,longitude,name,review_count,stars,state
497,"{'Waiter Service': False, 'Outdoor Seating': F...",-0HGqwlfw3I8nkJyMHxAsQ,"[Burgers, Fast Food, Restaurants]",Phoenix,"4750 E Warner Rd\nPhoenix, AZ 85044",33.331156,-111.981475,McDonald's,9,3.0,AZ
633,"{'Good For': {'breakfast': False, 'brunch': Fa...",-0QBrNvhrPQCaeo7mTo0zQ,[Restaurants],Phoenix,"2526 W Van Buren St\nPhoenix, AZ 85009",33.451391,-112.114419,La Salcita,3,4.5,AZ
742,"{'Waiter Service': True, 'Outdoor Seating': Fa...",-0bUDim5OGuv8R0Qqq6J4A,"[Bakeries, Food, Breakfast & Brunch, Restaurants]",Phoenix,"7023 N 19th Ave\nPhoenix, AZ 85021",33.539689,-112.099183,IHOP,8,2.0,AZ
645,"{'Outdoor Seating': False, 'Order at Counter':...",-1bOb2izeJBZjHC7NWxiPA,"[Breakfast & Brunch, Cafes, American (Traditio...",Phoenix,"61 W. Thomas Road\nPhoenix, AZ 85013",33.479996,-112.077144,First Watch,120,4.0,AZ
111,"{'Open 24 Hours': False, 'Has TV': False, 'Noi...",-3WVw1TNQbPBzaKCaQQ1AQ,"[Chinese, Restaurants]",Phoenix,"302 E Flower St\nPhoenix, AZ 85012",33.485917,-112.069074,China Chili,320,4.0,AZ


In [193]:
#output json files for restaurants in Phoenix (AZ), Las Vegas (NV), Charlotte (NC), Pittsburg (PA)

#sorted_yelp_rb_df.info()


def output_cityrestaurants(city_rb_df, cityname):
    filename = "data/" + cityname + "_Restaurants.json"
    fo = open(filename, "w")

    fo.write("[ ")
    number_restaurants = len(city_rb_df)
    index = 0

    for index in range(0,(number_restaurants-1)):
        fo.write("{ \"business_id\": ")
        fo.write("\""+city_rb_df['business_id'][index]+"\", ")
        fo.write("\"categories\": [")
        categories = city_rb_df['categories'][index]
        for i in range(len(categories)-1):
            fo.write("\""+categories[i]+"\", ")
        fo.write("\""+categories[len(categories)-1]+"\"], ")
        fo.write("\"city\": ")
        fo.write("\""+city_rb_df['city'][index]+"\", ")
        fo.write("\"full_address\": ")
        fo.write("\""+city_rb_df['full_address'][index]+"\", ")
        fo.write("\"latitude\": ")
        fo.write(str(city_rb_df['latitude'][index]))
        fo.write(", \"longitude\": ")
        fo.write(str(city_rb_df['longitude'][index]))
        fo.write(", \"name\": ")
        fo.write("\""+city_rb_df['name'][index]+"\", ")
        fo.write("\"review_count\": ")
        fo.write(str(city_rb_df['review_count'][index]))
        fo.write(", \"stars\": ")
        fo.write(str(city_rb_df['stars'][index]))
        fo.write(", \"state\": ")
        fo.write("\""+city_rb_df['state'][index]+"\"}, ")

    index = index + 1

    fo.write("{ \"business_id\": ")
    fo.write("\""+city_rb_df['business_id'][index]+"\", ")
    fo.write("\"categories\": [")
    categories = city_rb_df['categories'][index]
    for i in range(len(categories)-1):
        fo.write("\""+categories[i]+"\", ")
    fo.write("\""+categories[len(categories)-1]+"\"], ")
    fo.write("\"city\": ")
    fo.write("\""+city_rb_df['city'][index]+"\", ")
    fo.write("\"full_address\": ")
    fo.write("\""+city_rb_df['full_address'][index]+"\", ")
    fo.write("\"latitude\": ")
    fo.write(str(city_rb_df['latitude'][index]))
    fo.write(", \"longitude\": ")
    fo.write(str(city_rb_df['longitude'][index]))
    fo.write(", \"name\": ")
    fo.write("\""+city_rb_df['name'][index]+"\", ")
    fo.write("\"review_count\": ")
    fo.write(str(city_rb_df['review_count'][index]))
    fo.write(", \"stars\": ")
    fo.write(str(city_rb_df['stars'][index]))
    fo.write(", \"state\": ")
    fo.write("\""+city_rb_df['state'][index]+"\"} ")

    fo.write("]")
    fo.close()
    
    return

#output_cityrestaurants(phoenix_restaurantdf, 'Phoenix')
#output_cityrestaurants(lasvegas_restaurantdf, 'LasVegas')
#output_cityrestaurants(charlotte_restaurantdf, 'Charlotte')
#output_cityrestaurants(pittsburg_restaurantdf, 'Pittsburgh')

In [3]:
#file  = "data/LasVegas_Restaurants.json"
#file = file.replace('\\n', ' ')
#data = json.loads('data/LasVegas_Restaurants.json', "r")

test = pd.read_json('data/LasVegas_Restaurants.json')
print(len(test))

ValueError: Expected object or value

In [5]:
# read the entire review json file into a python array
with open('../yelp_academic_dataset_review.json') as f:
    data = f.readlines()

# remove the trailing "\n" from each line

print(len(data))

index = 0
test_strjson1=""
test_strjson2=""
test_strjson3=""
while(index < 741700):
    test_strjson1=test_strjson1 + data[index]
    test_strjson1=test_strjson1 + ','
    index = index + 1

test_strjson1=test_strjson1 + data[index]
test_strjson1 = "[" + test_strjson1 + "]"
    
while(index < 1483400):
    test_strjson2=test_strjson2 + data[index]
    test_strjson2=test_strjson2 + ','
    index = index + 1

test_strjson2=test_strjson2 + data[index]
test_strjson2 = "[" + test_strjson2 + "]"

while(index < 2225212):
    test_strjson3=test_strjson3 + data[index]
    test_strjson3=test_strjson3 + ','
    index = index + 1

test_strjson3=test_strjson3 + data[index]
test_strjson3 = "[" + test_strjson3 + "]"
    
#print(len(test_strjson))
#print(test_strjson)
#print(data[0])

#test_jsondata = pd.read_json(test_strjson)
review1_jsondata = json.loads(test_strjson1)
review2_jsondata = json.loads(test_strjson2)
review3_jsondata = json.loads(test_strjson3)
#print(test_jsondata)

review1_json_df = pd.DataFrame(review1_jsondata)
review2_json_df = pd.DataFrame(review2_jsondata)
review3_json_df = pd.DataFrame(review3_jsondata)
#review1_json_df.head(3)
review2_json_df.head(3)
#review3_json_df.head(3)

#print(len(review1_json_df))
#print(len(review2_json_df))
#print(len(review3_json_df))

2225213


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,N6aDT4tLb3nrecPEQLejTg,2012-03-07,__G6a899C8csvtNvJ118Aw,4,"I like that Razmataz sells unique items, and y...",review,3XuPCUb3J1SoxPbNyl3boA,"{'funny': 0, 'useful': 0, 'cool': 0}"
1,N6aDT4tLb3nrecPEQLejTg,2012-10-26,Z5-UxlpprG3AeqFVQJfC4A,2,Razmataz is going out of business ... to close...,review,0TWVQHr52bJF4KPvKdOXWg,"{'funny': 0, 'useful': 0, 'cool': 0}"
2,GFWo7WJKmhp_VH6C0gnTDw,2013-02-13,5yugXiXPmdlUnNNU1foQGA,5,There are quite a few gas station around where...,review,X1udS3Ad1tmcXw0hv69NLw,"{'funny': 0, 'useful': 2, 'cool': 0}"


In [24]:
#output review json files for restaurants in Phoenix (AZ), Las Vegas (NV), Charlotte (NC), Pittsburg (PA)

#read Phoenix restaurant data

def output_cityreview(cityname):
    jsonfile = "data/" + cityname + "_Restaurants.json"
    city_rb_df = pd.read_json(jsonfile)
    city_rb_businessid = city_rb_df['business_id']

    index = 0
    business_id = []
    review_id = []
    user_id = []
    stars = []
    text = []
    date = []
    votes = []

    def match_citybus_df(city_bus_df, review_df):
        for index in range(0,len(city_bus_df)):
            city_matched_df = return_matchedreview_df(review_df, city_bus_df['business_id'][index])
            city_matched_df = city_matched_df.reset_index(drop=True)
            for i in range(0, len(city_matched_df)):
                    business_id.append(city_matched_df['business_id'][i])
                    date.append(city_matched_df['date'][i])
                    review_id.append(city_matched_df['review_id'][i])
                    stars.append(city_matched_df['stars'][i])
                    text.append(city_matched_df['text'][i])
                    user_id.append(city_matched_df['user_id'][i])
                    votes.append(city_matched_df['votes'][i])         
            #print(len(city_matched_df))
        return

    def return_matchedreview_df(review_df, match_businessid):
        matched_df = review_df.ix[review_df['business_id'] == match_businessid] 
        return matched_df

    match_citybus_df(city_rb_df, review1_json_df)
    match_citybus_df(city_rb_df, review2_json_df)
    match_citybus_df(city_rb_df, review3_json_df)

    city_review_df = pd.DataFrame({'business_id': business_id,
                                      'date': date,
                                      'review_id': review_id,
                                      'stars': stars,
                                      'text': text,
                                      'user_id': user_id,
                                      'votes': votes})

    print(len(city_review_df))
    
    filename = "data/" + cityname + "_ReviewData.json"
    city_review_df.to_json(filename)
    
    return

#output_cityreview('Phoenix')
#output_cityreview('Charlotte')
#output_cityreview('LasVegas')
#output_cityreview('Pittsburgh')

61849


In [6]:
#phoenix_review_df.to_json('data/Phoenix_ReviewData.json')

test_df = pd.read_json('data/Phoenix_TipData.json')
print(len(test_df))
test_df.head(5)

#matched_df = test_df.ix[test_df['business_id'] == "LT0AgHTpDifuHSXgyLmJwg"]
#print(len(matched_df))

54085


Unnamed: 0,business_id,date,likes,text,user_id
0.0,x5Mv61CnZLohZWxfCVCPTQ,2012-09-05,0,Blah...,DBGuQbQy0tPe7Muoawq08Q
1.0,x5Mv61CnZLohZWxfCVCPTQ,2013-07-22,0,Good monday deals,0qIsBt4EzBDCKrIviV55Ew
10.0,KPoTixdjoJxSqRSEApSAGg,2010-06-25,0,Spectacular!,qpplec-ajubZNLQrp8pa0g
100.0,KPoTixdjoJxSqRSEApSAGg,2015-01-16,0,Best holy basil ever!!,nTe10razVBO1Y-jNZoFS6w
1000.0,odhXwWaYZvD_icIN6f_DbA,2011-09-07,0,Great Greek salad,D7WHoCGMd0adl5118eXuPg


In [4]:
# read the entire file into a python array
with open('../yelp_academic_dataset_tip.json') as f:
    data = f.readlines()

# remove the trailing "\n" from each line
data = list(map(lambda x: x.rstrip(), data))

# each element of 'data' is an individual JSON object.
# convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object

data_json_str = "[" + ','.join(data) + "]"

# now, load it into pandas
yelp_tipdata_df = pd.read_json(data_json_str)

yelp_tipdata_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591864 entries, 0 to 591863
Data columns (total 6 columns):
business_id    591864 non-null object
date           591864 non-null datetime64[ns]
likes          591864 non-null int64
text           591864 non-null object
type           591864 non-null object
user_id        591864 non-null object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 31.6+ MB


In [8]:
#output tip data json files for restaurants in Phoenix (AZ), Las Vegas (NV), Charlotte (NC), Pittsburg (PA)

def output_citytipdata(cityname):
    jsonfile = "data/" + cityname + "_Restaurants.json"
    city_rb_df = pd.read_json(jsonfile)
    city_rb_businessid = city_rb_df['business_id']
    
    index = 0
    business_id = []
    date = []
    likes = []
    text = []
    user_id = []
    
    def match_citybus_df(city_bus_df, tip_df):
        for index in range(0,len(city_bus_df)):
            city_matched_df = return_matchedreview_df(tip_df, city_bus_df['business_id'][index])
            city_matched_df = city_matched_df.reset_index(drop=True)
            for i in range(0, len(city_matched_df)):
                    business_id.append(city_matched_df['business_id'][i])
                    date.append(city_matched_df['date'][i])
                    likes.append(city_matched_df['likes'][i])
                    text.append(city_matched_df['text'][i])
                    user_id.append(city_matched_df['user_id'][i])         
            #print(len(city_matched_df))
        return
    
    def return_matchedreview_df(tip_df, match_businessid):
        matched_df = tip_df.ix[tip_df['business_id'] == match_businessid] 
        return matched_df
    
    match_citybus_df(city_rb_df, yelp_tipdata_df)
    
    city_tipdata_df = pd.DataFrame({'business_id': business_id,
                                     'date': date,
                                     'likes': likes,
                                     'text': text,
                                     'user_id': user_id})

    print(len(city_tipdata_df))
    
    filename = "data/" + cityname + "_TipData.json"
    city_tipdata_df.to_json(filename)
    
    return

#output_citytipdata('Phoenix')
output_citytipdata('Charlotte')
output_citytipdata('LasVegas')
output_citytipdata('Pittsburgh')

19715
147744
10181


In [13]:
# read the entire file into a python array
with open('../yelp_academic_dataset_user.json') as f:
    data = f.readlines()

# remove the trailing "\n" from each line
data = list(map(lambda x: x.rstrip(), data))

# each element of 'data' is an individual JSON object.
# convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object

data_json_str = "[" + ','.join(data) + "]"

# now, load it into pandas
yelp_userdata_df = pd.read_json(data_json_str)

yelp_userdata_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 552339 entries, 0 to 552338
Data columns (total 11 columns):
average_stars    552339 non-null float64
compliments      552339 non-null object
elite            552339 non-null object
fans             552339 non-null int64
friends          552339 non-null object
name             552339 non-null object
review_count     552339 non-null int64
type             552339 non-null object
user_id          552339 non-null object
votes            552339 non-null object
yelping_since    552339 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 50.6+ MB


In [38]:
#output user data json files for restaurants reviews/tip data in Phoenix (AZ), Las Vegas (NV), Charlotte (NC), Pittsburg (PA)

def output_cityuserdata(cityname):
    jsonfile = "data/" + cityname + "_ReviewData.json"
    city_reviewdata_df = pd.read_json(jsonfile)
    city_reviewdata_userid = city_reviewdata_df['user_id']
    print(len(city_reviewdata_df))
    
    jsonfile = "data/" + cityname + "_TipData.json"
    city_tipdata_df = pd.read_json(jsonfile)
    city_tipdata_userid = city_tipdata_df['user_id']
    print(len(city_tipdata_df))
    
    index = 0
    average_stars = []
    compliments = []
    elite = []
    fans = []
    friends = []
    name = []
    review_count = []
    user_id = []
    votes = []
    yelping_since = []
    
    def match_userid_df(city_reviewtip_df, userdata_df):
        for index in range(0,len(city_reviewtip_df)):
            city_matched_df = return_matchedreview_df(userdata_df, city_reviewtip_df['user_id'][index])
            city_matched_df = city_matched_df.reset_index(drop=True)
            for i in range(0, len(city_matched_df)):
                    average_stars.append(city_matched_df['average_stars'][i])
                    compliments.append(city_matched_df['compliments'][i])
                    elite.append(city_matched_df['elite'][i])
                    fans.append(city_matched_df['fans'][i])
                    friends.append(city_matched_df['friends'][i])
                    name.append(city_matched_df['name'][i])
                    review_count.append(city_matched_df['review_count'][i])
                    user_id.append(city_matched_df['user_id'][i])
                    votes.append(city_matched_df['votes'][i])
                    yelping_since.append(city_matched_df['yelping_since'][i])
            #print(len(city_matched_df))
            
            if(index%5000 == 0):
                print(index)
        return
    
    def return_matchedreview_df(userdata_df, match_userid):
        matched_df = userdata_df.ix[userdata_df['user_id'] == match_userid] 
        return matched_df
    
    match_userid_df(city_reviewdata_df, yelp_userdata_df)
    match_userid_df(city_tipdata_df, yelp_userdata_df)
    
    city_userdata_df = pd.DataFrame({'average_stars': average_stars,
                                    'compliments': compliments,
                                    'elite': elite,
                                    'fans': fans,
                                    'friends': friends,
                                    'name': name,
                                    'review_count': review_count,
                                    'user_id': user_id,
                                    'votes': votes,
                                    'yelping_since': yelping_since})
    
    print(len(city_userdata_df))
    
    filename = "data/" + cityname + "_UserData_Duplicates.json"
    city_userdata_df.to_json(filename)
    
    #sorted_city_userdata_df = city_userdata_df.sort_values('user_id')
    #print(sorted_city_userdata_df)
    
    return

#output_cityuserdata('Pittsburgh')
#output_cityuserdata('Phoenix')
#output_cityuserdata('Charlott
#output_cityuserdata('LasVegas')

In [36]:
#remove duplicates in the user data files
def remove_duplicates(cityname):
    jsonfile = "data/" + cityname + "_UserData_Duplicates.json"
    city_userdata_df = pd.read_json(jsonfile)
    print(len(city_userdata_df))
    city_userdata_nodup_df = city_userdata_df.drop_duplicates('user_id')
    print(len(city_userdata_nodup_df))
    
    filename = "data/" + cityname + "_UserData.json"
    city_userdata_nodup_df.to_json(filename)
    return

#remove_duplicates('Phoenix')
#remove_duplicates('Pittsburgh')
#remove_duplicates('Charlotte')
#remove_duplicates('LasVegas')