### Let's try and connect to the S3 instance (already configured)

Yelp doesn't give us permission to search for objects in their bucket, so we just search for a specific object from todays current date going backwards

In [36]:
import botocore.session
import datetime

def generateYelpDataURL(client, bucket_name):
    dateformat = '%Y%m%d'
    now = datetime.datetime.now()
    day_str = now.strftime(dateformat) # eg '20151008'
    ext = '_businesses.json.gz'
    filename =  day_str + ext
    
    url = client.generate_presigned_url('get_object',
                                       Params={'Bucket': 'yelp-syndication',
                                               'Key':'nychealth/'+filename },
                                       ExpiresIn=3600 # 1 hour in seconds
                                       )
    return url, filename

def downloadURLToFile(url, filename):
    with open(filename, 'wb') as handle:
        response = requests.get(url, stream=True)

        if not response.ok:
            print "Request didn't work, something probably wrong with URL"

        for block in response.iter_content(1024):
            handle.write(block)

In [43]:
bucket_name = 'yelp-syndication/nychealth' #'%2F = / in url conversion
session = botocore.session.get_session()
client = session.create_client('s3')
yelp_url, filename = generateYelpDataURL(client, bucket_name)
print yelp_url

https://yelp-syndication.s3.amazonaws.com/nychealth/20151106_businesses.json.gz?AWSAccessKeyId=AKIAJYCFDQN5JPPAWMLQ&Expires=1446867365&Signature=p1htOBLZd4fyuj8i%2BlMEt6oTNrw%3D


In [42]:
import requests

data_dir = '../foodbornenyc/sources/yelpfiles/'
print "Downloading yelp data to this file: %s" % filename
print "    from URL: %s" % yelp_url
downloadURLToFile(yelp_url, data_dir + filename)

Downloading yelp data to this file: 20151106_businesses.json.gz
    from URL: https://yelp-syndication.s3.amazonaws.com/nychealth/20151106_businesses.json.gz?AWSAccessKeyId=AKIAJYCFDQN5JPPAWMLQ&Expires=1446863860&Signature=qJTMYbbb9BU7j%2FphTSlOGpDX1uA%3D


ConnectionError: ('Connection aborted.', gaierror(8, 'nodename nor servname provided, or not known'))

### Loading in yelp json

Yelp has malformed JSON: after a business, there is no ',' (ie "}  { ")

In [None]:
#stream the file and correct the malformed json so ijson can read it
with open('foodbornenyc/foodbornenyc/data/sources/yelpfiles/yelp_mini_sample.json', 'rb') as yelp:
    with open('foodbornenyc/foodbornenyc/data/sources/yelpfiles/fixed_yelp_mini_sample.json', 'wb') as fixed:
        fixed.write('[') # make it an array of objects
        i=1
        for line in yelp:
            if i==1:
                fixed.write(line.replace('{"business_url"', '{"business_url"')+'\n')
            else:
                fixed.write(line.replace('{"business_url"', ',{"business_url"')+'\n')
            i += 1
        fixed.write("]")
        

In [6]:
import ijson.backends.yajl2 as ijson
from ijson.common import IncompleteJSONError

In [11]:
#make a medium sized dataset
with open('../foodbornenyc/sources/yelpfiles/fixed_yelp_mini_sample.json') as yelp:
    biz_list = ijson.items(yelp,'item') # get the list as a generator
    for biz in biz_list: #
        print dict(biz).keys()
        print
        print dict(dict(biz)['reviews'][0])['created']
        print
        print

['rating', 'time_updated', 'business_url', 'name', 'url', 'reviews', 'phone', 'is_closed', 'review_count', 'id', 'categories', 'location']

2015-10-08


['rating', 'time_updated', 'business_url', 'name', 'url', 'reviews', 'phone', 'is_closed', 'review_count', 'id', 'categories', 'location']

2015-07-25


['rating', 'time_updated', 'business_url', 'name', 'url', 'reviews', 'phone', 'is_closed', 'review_count', 'id', 'categories', 'location']

2015-08-11




In [None]:
with open('../foodbornenyc/sources/yelpfiles/fixed_yelp_mini_sample.json') as yelp:
    biz_list = ijson.items(yelp,'item') # get the list as a generator
    for biz in biz_list: #
        print dict(biz)['id']
        print dict(dict(biz)['reviews'][0])['id']
    
#     parser = ijson.parse(yelp)
#     for prefix, event, value in parser:
#         if 'review' not in prefix:
#             print "P: ", prefix, " , E: ", event, " V: ",value
# #         if (prefix, event, value) == ('','map_key', 'business_url'):
#             print "P: ", prefix, " , E: ", event, " V: ",value
#             print "Business:", parser.next()
                
    #except Exception:
    #    print "P: ", prefix, " , E: ", event, " V: ",value

In [35]:
# find the ordering of the yelp reviews
import json
import datetime
ids = []
with open('../foodbornenyc/sources/yelpfiles/yelp_businesses.json') as yelp:
    for i, b in enumerate(yelp):
        biz = json.loads(b)
        for j, rev in enumerate(biz['reviews']):
            if i == 0 and j==0:
                print "KEYS:"
                print "BIZ: ", biz.keys()
                print "REV: ", rev.keys()
            ids.append(rev['created'])
        if i >= 0: break
            
print zip(ids, sorted(ids, reverse=True))

KEYS:
BIZ:  [u'rating', u'time_updated', u'business_url', u'name', u'url', u'reviews', u'phone', u'categories', u'review_count', u'id', u'is_closed', u'location']
REV:  [u'rating', u'is_selected', u'url', u'text', u'created', u'user', u'id']
[(u'2015-10-23', u'2015-10-23'), (u'2015-10-22', u'2015-10-23'), (u'2015-10-17', u'2015-10-23'), (u'2015-10-21', u'2015-10-23'), (u'2015-09-28', u'2015-10-23'), (u'2015-10-14', u'2015-10-22'), (u'2015-10-10', u'2015-10-22'), (u'2015-09-02', u'2015-10-22'), (u'2015-09-09', u'2015-10-21'), (u'2015-09-08', u'2015-10-21'), (u'2015-09-28', u'2015-10-21'), (u'2015-08-21', u'2015-10-19'), (u'2015-10-07', u'2015-10-19'), (u'2015-09-28', u'2015-10-19'), (u'2015-07-28', u'2015-10-17'), (u'2015-08-13', u'2015-10-17'), (u'2015-10-22', u'2015-10-17'), (u'2015-09-16', u'2015-10-16'), (u'2015-08-08', u'2015-10-15'), (u'2015-07-29', u'2015-10-15'), (u'2015-07-31', u'2015-10-14'), (u'2015-09-29', u'2015-10-14'), (u'2015-10-19', u'2015-10-13'), (u'2015-06-20', u'201

In [None]:
from geopy.geocoders import Nominatim

In [None]:
geoLocator = Nominatim()
loc = geoLocator.geocode("35 - 17 Broadway Astoria NY")
lat = loc.latitude
lon = loc.longitude
print type(lat),lon

In [1]:
# how many restaurants in the real feed?
from time import time
import json
start = time()
with open('../foodbornenyc/sources/yelpfiles/yelp_businesses.json', 'rb') as yelp:
     with open('../foodbornenyc/sources/yelpfiles/yelp_5000_businesses.json', 'wb') as medium:
        lines = 0
        for line in yelp:
            if lines % 100 == 0: print lines
            try:
                json.loads(line)
            except ValueError:
                print "Skipping line %i" % lines
                lines +=1
                continue
            medium.write(line)
            lines +=1
            if lines >= 5000: break
print lines
print time() - start

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
10.4938240051


In [None]:
rate = 100/2.72 #business/min

In [None]:
time = lines/rate
print time
print time/60., "hours approx"

In [None]:
uniform = 100./lines
print uniform

In [None]:
from scipy.stats import bernoulli

In [None]:
flip = bernoulli(uniform)

In [None]:
print flip.rvs(size=1)

In [4]:
from datetime import datetime
for i in range(100):
    print datetime.now()

2015-11-05 08:52:33.493410
2015-11-05 08:52:33.493476
2015-11-05 08:52:33.493516
2015-11-05 08:52:33.493552
2015-11-05 08:52:33.493585
2015-11-05 08:52:33.493615
2015-11-05 08:52:33.493648
2015-11-05 08:52:33.493680
2015-11-05 08:52:33.493710
2015-11-05 08:52:33.493742
2015-11-05 08:52:33.493774
2015-11-05 08:52:33.493802
2015-11-05 08:52:33.493835
2015-11-05 08:52:33.493867
2015-11-05 08:52:33.493899
2015-11-05 08:52:33.493930
2015-11-05 08:52:33.493961
2015-11-05 08:52:33.493992
2015-11-05 08:52:33.494020
2015-11-05 08:52:33.494051
2015-11-05 08:52:33.494083
2015-11-05 08:52:33.494115
2015-11-05 08:52:33.494146
2015-11-05 08:52:33.494177
2015-11-05 08:52:33.494208
2015-11-05 08:52:33.494236
2015-11-05 08:52:33.494266
2015-11-05 08:52:33.494298
2015-11-05 08:52:33.494331
2015-11-05 08:52:33.494363
2015-11-05 08:52:33.494394
2015-11-05 08:52:33.494424
2015-11-05 08:52:33.494452
2015-11-05 08:52:33.494482
2015-11-05 08:52:33.494514
2015-11-05 08:52:33.494546
2015-11-05 08:52:33.494576
2

In [15]:
import cProfile
import StringIO
import pstats
import contextlib