In [4]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import yaml
import ast

ModuleNotFoundError: No module named 'yaml'

### Import secret Keys to use with Foursquare API

In [11]:
with open('/Users/ReddingSkinnyRobot/.secrets/foursquare_api.yaml') as f:
    secrets = yaml.load(f)

### Calculate bounding limits from http://boundingbox.klokantech.com/ TSV format

In [1]:
westlimit=-122.459696 #Longitude
southlimit=47.491912  #Latitude
eastlimit=-122.224433 #Longitude
northlimit=47.734145  #Latitude

In [3]:
(southlimit + northlimit)/2

47.6130285

In [13]:
km_east_west = (eastlimit - westlimit) * 97
km_north_south = (northlimit - southlimit) * 111

In [14]:
km_east_west/100*1000 #meters in each grid east west

228.20510999998945

In [15]:
km_north_south/100*1000 #meters in each grid north south

268.8786299999987

Difference between degrees of Longitude is about 60 miles (97km) 

Difference between degrees of Latitude is about 69 miles (111km)

### Define search functions

In [16]:
def populate_search_params(lat, long):
    search_params = dict(
        client_id=secrets['client_id'],
        client_secret=secrets['client_secret'],
        ll = '{}, {}'.format(lat, long),
        intent='browse',
        radius='200', #Meters
        limit='50',
        categoryId='4bf58dd8d48988d1e0931735',#Coffee shop
        llAcc='1',#Accuracy of lat & long in meters
        v='20180113' #Date of current version
        )
    return search_params

In [17]:
def get_venue_data(longitude_group, latitude_group):
    '''
    Takes in lists of longitudes and latitudes and performs a grid search of them, returning a max of 50 coffee
    shops per each intersection formatted as a list of json objects.
    '''
    search_url = 'https://api.foursquare.com/v2/venues/search'
    search_list = []
    for i, long in enumerate(longitude_group):
        for lat in latitude_group:
            search_params = populate_search_params(lat, long)
            try:
                search_resp = requests.get(url=search_url, params=search_params)
                search_data = json.loads(search_resp.text)
                if search_data['meta']['code'] == 403:
                    print('403 error - Exceeded rate limit')
                    print(search_data)
                    return search_list
                search_list.append(search_data)
            except:
                search_list.append((lat, long))
        print('Step {} of {}'.format(i+1, len(longitude_group)))
    return search_list

In [18]:
def write_venue_info_to_file(search_data, filename):
    '''
    Takes in a list of loaded json objects and writes them to a text file.
    '''
    venues = []
    for item in search_data:
        # Checks for empty response
        try:
            if item['response']['venues'] != []:
                for venue in item['response']['venues']:
                    venues.append(venue)
        except:
            continue
    with open(filename, 'a') as f:
        for item in venues:
            f.write("{}\n".format(item))
    print('Done!')

### Generate longitude & latitude grids and divide search into smaller groups to avoid breakage during API GET requests

#### This needs to be broken into three groups to avoid Foursquare's 5000 requests limit

In [19]:
number_of_gridlines = 100
longitude_grid = np.linspace(westlimit, eastlimit, number_of_gridlines)
latitude_group = np.linspace(northlimit, southlimit, number_of_gridlines)

In [20]:
longitude_group1 = longitude_grid[:33]
longitude_group2 = longitude_grid[33:66]
longitude_group3 = longitude_grid[66:]

### Test single GET from venues/search api

In [21]:
test_lat = latitude_group[49]

In [22]:
test_long = longitude_group2[15]

<img src="images/test_venues_GET_map.jpeg">

In [61]:
get_venue_data([test_long], [test_lat])

Step 1 of 1


[{'meta': {'code': 200, 'requestId': '5a6127009fb6b76466b01fb9'},
  'response': {'venues': [{'allowMenuUrlEdit': True,
     'beenHere': {'lastCheckinExpiredAt': 0},
     'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/breakfast_',
        'suffix': '.png'},
       'id': '4bf58dd8d48988d143941735',
       'name': 'Breakfast Spot',
       'pluralName': 'Breakfast Spots',
       'primary': True,
       'shortName': 'Breakfast'}],
     'contact': {'formattedPhone': '(206) 728-2219',
      'phone': '2067282219',
      'twitter': 'biscuitbitch'},
     'hasPerk': False,
     'id': '5510b894498e3dbab70b5d8d',
     'location': {'address': '2303 3rd Ave',
      'cc': 'US',
      'city': 'Seattle',
      'country': 'United States',
      'crossStreet': '3rd Ave & Bell St',
      'distance': 80,
      'formattedAddress': ['2303 3rd Ave (3rd Ave & Bell St)',
       'Seattle, WA 98121',
       'United States'],
      'labeledLatLngs': [{'label': 'display',
        'la

### Search for all venues that match my search params & write them to txt file

#### Instantiate an empty venue search list which the following for loops and GET requests will populate

In [None]:
venue_search_list = []
search_url = 'https://api.foursquare.com/v2/venues/search'

##### Longitude_group 1

In [15]:
search_list = get_venue_data(longitude_group1, latitude_group)

Step 1 of 33
Step 2 of 33
Step 3 of 33
Step 4 of 33
Step 5 of 33
Step 6 of 33
Step 7 of 33
Step 8 of 33
Step 9 of 33
Step 10 of 33
Step 11 of 33
Step 12 of 33
Step 13 of 33
Step 14 of 33
Step 15 of 33
Step 16 of 33
Step 17 of 33
Step 18 of 33
Step 19 of 33
Step 20 of 33
Step 21 of 33
Step 22 of 33
Step 23 of 33
Step 24 of 33
Step 25 of 33
Step 26 of 33
Step 27 of 33
Step 28 of 33
Step 29 of 33
Step 30 of 33
Step 31 of 33
Step 32 of 33
Step 33 of 33


In [32]:
write_venue_info_to_file(search_list, 'seattle_venues.txt')

Done!


##### Longitude_group 2

In [33]:
search_list = get_venue_data(longitude_group2, latitude_group)

Step 1 of 33
Step 2 of 33
Step 3 of 33
Step 4 of 33
Step 5 of 33
Step 6 of 33
Step 7 of 33
Step 8 of 33
Step 9 of 33
Step 10 of 33
Step 11 of 33
Step 12 of 33
Step 13 of 33
Step 14 of 33
Step 15 of 33
Step 16 of 33
Step 17 of 33
Step 18 of 33
Step 19 of 33
Step 20 of 33
Step 21 of 33
Step 22 of 33
Step 23 of 33
Step 24 of 33
Step 25 of 33
Step 26 of 33
Step 27 of 33
Step 28 of 33
Step 29 of 33
Step 30 of 33
Step 31 of 33
Step 32 of 33
Step 33 of 33


In [39]:
write_venue_info_to_file(search_list, 'seattle_venues.txt')

Done!


##### Longitude_group 3

In [62]:
search_list = get_venue_data(longitude_group3, latitude_group)

Step 1 of 34
Step 2 of 34
Step 3 of 34
Step 4 of 34
Step 5 of 34
Step 6 of 34
Step 7 of 34
Step 8 of 34
Step 9 of 34
Step 10 of 34
Step 11 of 34
Step 12 of 34
Step 13 of 34
Step 14 of 34
Step 15 of 34
Step 16 of 34
Step 17 of 34
Step 18 of 34
Step 19 of 34
Step 20 of 34
Step 21 of 34
Step 22 of 34
Step 23 of 34
Step 24 of 34
Step 25 of 34
Step 26 of 34
Step 27 of 34
Step 28 of 34
Step 29 of 34
Step 30 of 34
Step 31 of 34
Step 32 of 34
Step 33 of 34
Step 34 of 34


In [67]:
write_venue_info_to_file(search_list, 'test2.txt')

Done!


### Iterate through file and extract json info into database

In [226]:
with open('seattle_venues.txt') as f:
    all_venues = []
    for item in f:
        all_venues.append(ast.literal_eval(item))

In [231]:
data = json_normalize(all_venues)

### Clean database

In [262]:
columns_to_drop = ['allowMenuUrlEdit', 'beenHere.lastCheckinExpiredAt', 'categories',
       'contact.facebook', 'contact.facebookName', 'contact.facebookUsername',
       'contact.formattedPhone', 'contact.instagram', 'contact.phone',
       'contact.twitter', 'delivery.id', 'delivery.provider.name',
       'delivery.url', 'hasMenu', 'hasPerk', 'location.crossStreet', 'location.distance',
       'location.formattedAddress', 'location.labeledLatLngs', 
       'location.neighborhood', 'menu.anchor', 'menu.externalUrl', 'menu.label',
       'menu.mobileUrl', 'menu.type', 'menu.url', 'referralId', 'specials.count',
       'specials.items', 'stats.checkinsCount', 'venueChains', 'venuePage.id',
       'venueRatingBlacklisted', 'verified', 'location.cc', 'location.country', 'storeId', 'url']

In [314]:
data_with_dropped_columns = data.drop(labels=columns_to_drop, axis=1)

In [315]:
data_filtered = data_with_dropped_columns.drop_duplicates(subset='id').reset_index().drop('index', axis=1)

In [318]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870 entries, 0 to 869
Data columns (total 10 columns):
id                     870 non-null object
location.address       746 non-null object
location.city          861 non-null object
location.lat           870 non-null float64
location.lng           870 non-null float64
location.postalCode    795 non-null object
location.state         870 non-null object
name                   870 non-null object
stats.tipCount         870 non-null int64
stats.usersCount       870 non-null int64
dtypes: float64(2), int64(2), object(6)
memory usage: 68.0+ KB


### Define tips functions

In [327]:
tips_params = dict(
    client_id=secrets['client_id'],
    client_secret=secrets['client_secret'],
    limit=500,
    v='20180113')

In [333]:
def get_tips_data(venue_id):
    '''
    Takes in a venue id and performs a GET request to Foursquare's API to retrieve a max of 500 tips formatted
    as a json object. Returns a list of strings of tips.
    '''
    tips_list = []
        tips_url = 'https://api.foursquare.com/v2/venues/{}/tips'.format(venue_id)
        try:
            tips_resp = requests.get(url=tips_url, params=tips_params)
            tips_data = json.loads(tips_resp.text)
            if tips_data['meta']['code'] == 403:
                print('403 error - Exceeded rate limit')
                print(tips_data)
                return tips_list
            for tip in tips_data['response']['tips']['items']:
                tips_list.append(tip['text'])
        except:
            tips_list.append(venue_id)
    return tips_list

### Test single GET from tips api


In [322]:
test_id = data_filtered.iloc[0]['id']

In [338]:
get_tips_data(test_id)

['Absolutely incredible view, in my opinion the best in all of Seattle! Just take some time to get your coffee and relax. You can sit outside or near one of the sliding windows!',
 'Great views of the beach! Lots of places to plug in laptops. Lots of seating, even sit by the fireplace.',
 "Sunday's on alki in the summer are very busy, this place is no exception. Sorely understaffed on a Sunday night. Long wait just to order. 😕",
 'Nicest people work this Starbucks. Best view in town & Relaxing atmosphere.',
 "As a Londoner, this is definitely the best view I've ever seen from a Starbucks window!",
 'A fantastic location with a great atmosphere on a Saturday night.',
 'Really friendly customer service! Loved it!',
 'Nice spot to watch the water while reading a book or working on your laptop.',
 'This location is great! Only thing better would be a Starbucks on the top of the Space Needle.  Like Dr. Evil.',
 "Try the iced passionfruit lemonade. You won't be disappointed.",
 'The big over

### Get tips into database

In [346]:
data_filtered['tips'] = data_filtered.apply(lambda row: get_tips_data(row['id']), axis=1)

In [351]:
data_filtered.to_csv('seattle_coffeeshops_foursquare.csv')

In [18]:
data_filtered = pd.read_csv('data/seattle_coffeeshops_foursquare.csv').drop('Unnamed: 0', axis=1)

### Define functions for Google's API

In [26]:
with open('/Users/ReddingSkinnyRobot/.secrets/google_api.yaml') as f:
    google_secrets = yaml.load(f)

In [130]:
def populate_google_search_url(lat, lng, place_name, api_key):
    '''
    Takes in latitude, longitude, place name (sting), and Google api key, and outputs the applicable api url.
    '''
    google_search_url = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius=5&language=english&keyword={}&key={}'.format(lat, lng, place_name, api_key)
    return google_search_url

In [131]:
def get_google_api_id(lat, lng, name, api_key):
    '''
    Takes in latitude, longitude, name, and Google API key and outputs Google's place id for location.
    '''
    search_url = populate_google_search_url(lat, lng, name, api_key)
    search_resp = requests.get(url=search_url)
    search_data = json.loads(search_resp.text)
    try:
        return search_data['results'][0]['place_id']
    except:
        return None

In [132]:
def populate_google_details_url(google_id, api_key):
    '''
    Takes in place id and Google api key, and outputs the applicable api url.
    '''
    google_details_url = 'https://maps.googleapis.com/maps/api/place/details/json?placeid={}&language=english&key={}'.format(google_id, api_key)
    return google_details_url

In [133]:
def get_google_id_reviews(google_id, api_key):
    '''
    Takes in place id and Google API key and outputs text reviews for locations.
    '''
    details_url = populate_google_details_url(google_id, api_key)
    details_resp = requests.get(url=details_url)
    details_data = json.loads(details_resp.text)
    reviews_list = []
    try:
        for review in details_data['result']['reviews']:
            reviews_list.append(review['text'])
        return reviews_list
    except:
        return reviews_list

### Test one pull from Google's search API

In [145]:
data_filtered.iloc[2]['name'].split()[0]

"Tully's"

In [143]:
test_id = get_google_api_id(data_filtered.iloc[2]['location.lat'], data_filtered.iloc[2]['location.lng'], data_filtered.iloc[2]['name'].split()[0], google_secrets['key'])

In [144]:
test_id

'ChIJJWcEGoZAkFQRjPWwxLTuX-g'

### Test one pull from Google's details API

In [161]:
test_details = get_google_id_reviews(test_id, google_secrets['key'])

In [166]:
test_details

["First of all props for opening on Christmas. Only coffee to be open! The place is cozy and has a nice fireplace for this winter weather. We ordered the white mocha and the black mocha and both were rich and delicious. \n\nThe egg nog latte wasn't as outstanding. \nIt's a good place if you are tired of the same old coffee places in Seattle. \nIt's a good spot to stop for a drink and walk around the beach.",
 "Sort of run down. Good coffee though. TV does not work, Jacks on the wall broken. Lots of little he's in the walls. Some dirty tables.  I think there is a reason why the other coffee shop is always packed.",
 "You're employees are unprofessional, playing with cleaning tools taking selfies, ignoring customers. Inappropriately touching eachother. They didn't even put shots of coffee in one drink I ordered, and didn't put cream in the other. When asked to fix they argued that there was nothing wrong. The female is eating food behind the counter. One employee is not even in uniform o

### Pull from Google's search API to get google_ids

In [170]:
data_filtered['google_ids'] = data_filtered.apply(lambda row: get_google_api_id(row['location.lat'],
                                                                                row['location.lng'],
                                                                                row['name'],
                                                                                google_secrets['key']),
                                                                                axis=1)

### Pull from Google's search API to get reviews

In [173]:
data_filtered['google_reviews'] = data_filtered.apply(lambda row: get_google_id_reviews(row['google_ids'],
                                                                                google_secrets['key']),
                                                                                axis=1)

### Define functions for Yelp's API

In [241]:
with open('/Users/ReddingSkinnyRobot/.secrets/yelp_api.yaml') as f:
    yelp_secrets = yaml.load(f)

In [242]:
def yelp_search_params(lat, lng, name):
    search_params = dict(
        term=name,
        latitude=lat,
        longitude=lng,
        radius=20,
        limit=1,
        )
    return search_params

In [243]:
def get_yelp_api_id(lat, lng, name, key):
    '''
    Takes in latitude, longitude, name, and Yelp's API key and outputs Yelp's id for location.
    '''
    yelp_search_url = 'https://api.yelp.com/v3/businesses/search'
    search_params = yelp_search_params(lat, lng, name)
    search_resp = requests.get(url=yelp_search_url, params=search_params, 
                               headers={'Authorization':'Bearer {}'.format(key)})
    search_data = json.loads(search_resp.text)
    try:
        return search_data['businesses'][0]['id']
    except:
        return None

In [244]:
def populate_yelp_reviews_url(yelp_id):
    '''
    Takes in Yelp id and outputs the applicable api url.
    '''
    yelp_reviews_url = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(yelp_id)
    return yelp_reviews_url

In [245]:
def get_yelp_id_reviews(yelp_id, api_key):
    '''
    Takes in Yelp id and Yelp API key and outputs text reviews for locations.
    '''
    yelp_reviews_url = populate_yelp_reviews_url(yelp_id)
    reviews_resp = requests.get(url=yelp_reviews_url, params={'locale':'en_US'},
                               headers={'Authorization':'Bearer {}'.format(api_key)})
    reviews_data = json.loads(reviews_resp.text)
    #return reviews_data
    reviews_list = []
    try:
        for review in reviews_data['reviews']:
            reviews_list.append(review['text'])
        return reviews_list
    except:
        return reviews_list

### Test one GET request on Yelp's search API

In [206]:
yelp_test_id = get_yelp_api_id(data_filtered.iloc[2]['location.lat'], 
                data_filtered.iloc[2]['location.lng'], 
                data_filtered.iloc[2]['name'],
                yelp_secrets['api_key'])

In [207]:
yelp_test_id

'tullys-coffee-seattle-9'

### Test one GET request on Yelp's reviews API

In [220]:
test_reviews = get_yelp_id_reviews(yelp_test_id, yelp_secrets['api_key'])

In [221]:
test_reviews

["After walking around the beach looking for coffee I was so pleased to find Tully's Coffee. My drink was so well made and soooooooooooo well needed. The...",
 '15 to 20 minute wait for a Blended Mocha?\nAt this point it should be just complimentary',
 "I'm writing this with sadness. I love Tully's. I use to come in nearly every day. But I feel like once the old manager left, everything went downhill...."]

### Pull from Yelp's search API to get ids

In [241]:
data_filtered['yelp_ids'] = data_filtered.apply(lambda row: get_yelp_api_id(row['location.lat'], 
                                                                            row['location.lng'], 
                                                                            row['name'], 
                                                                            yelp_secrets['api_key']), 
                                                axis=1)

### Pull from Yelp's reviews API to get reviews

In [244]:
data_filtered['yelp_reviews'] = data_filtered.apply(lambda row: get_yelp_id_reviews(row['yelp_ids'], 
                                                                                    yelp_secrets['api_key']), 
                                                    axis=1)

### Clean and combine text data

In [278]:
import string

In [314]:
translator = str.maketrans('', '', string.punctuation)

In [378]:
data_filtered['tips_clean'] = data_filtered.apply(lambda row: row['tips'].translate(translator).lower().replace('\n',' '), axis=1)

In [379]:
data_filtered['google_reviews_clean'] = data_filtered.apply(lambda row: ' '.join(row['google_reviews']).translate(translator).lower().replace('\n',' '), axis=1)

In [380]:
data_filtered['yelp_reviews_clean'] = data_filtered.apply(lambda row: ' '.join(row['yelp_reviews']).translate(translator).lower().replace('\n',' '), axis=1)

In [381]:
data_filtered['combined_reviews'] = data_filtered.apply(lambda row: ' '.join([row['tips_clean'], 
                                                                              row['google_reviews_clean'],
                                                                              row['yelp_reviews_clean']]),
                                                       axis=1)

In [382]:
data_filtered.to_csv('seattle_coffeeshops_combine_reviews.csv')

In [383]:
data_filtered['combined_reviews'][842]

'ladies restroom locks funny make sure the button is pushed in all the way good selection of beers too coffee ftw nice working setup write a novel it worked for me drink coffee all day then get a beer a pcc across the street caffe vita is an excellent coffee shop the croissants and old fashioned donuts are excellent and the coffee is tasty the hot chocolate is very flavorful and its overall a great chain great latte sweet vibes quiet place to work knowledgeable and friendly staff it is part of my job to help businesses with certain building repair many places have proven to be quite rude to my industry this place was very friendly treated me like a human and the atmosphere was very cozy i get to see the kitchens in these places this cafe has an enormous kitchen witch also proved to be very clean i was quite impressed great relaxing vibe with lovely baristas this is a great place to get work done it feels big and open unlike the typical claustrophobic seattle coffee shops for noncoffeed

In [146]:
data_filtered = pd.read_csv('data/seattle_coffeeshops_combine_reviews.csv')

In [147]:
data_filtered['combined_reviews'][600]

' clean and friendly atmosphere i love sitting next to the electric fireplace and reading the paper there also a nice bar along the windows with outlets if you need a quite and chill place to do some work they have great coffee they use sleepy monk roasters it’s a cozy spot i like the style of the space  it has an open modern feel with some retro accents and lots of light  and the coffee just might be the best ive had anywhere i am so glad  that i gave this place a try i only drink latte and am picky about it i can honestly say this place has the best latte in town great flavor and served at perfect temperature  cute spot tucked away off broadway half a block down from the qfc and library  they just opened so theyre not as packed as some of the other cafes around which means youre more likely to find a seat  great coffee friendly baristas refreshingly unpretentious  there are outlets for laptops in the cozy brightly decorated space  they use a roaster from cannon beach oregon called sl

### Filtering through data

In [148]:
#Checking for any city's not in Seattle
isnull_city_lat_lng = data_filtered[data_filtered['location.city'].isnull()][['location.lat', 'location.lng']]
#All of these appear to be in Seattle

In [149]:
data_filtered['location.city'] = data_filtered['location.city'].fillna(value='Seattle')

In [150]:
data_filtered['location.city'].unique()

array(['Seattle', 'West Seattle', 'Burien', 'Vancouver', 'Shoreline',
       'Tukwila', 'Renton', 'University Village', 'Kirkland', 'unknown',
       'Bothell', 'Inglewood-Finn Hill', 'Mercer Island', 'Clyde Hill'], dtype=object)

In [151]:
#Changed all Seattle-based city names to Seattle
data_filtered.loc[data_filtered['location.city'] == 'West Seattle', 'location.city'] = 'Seattle'
data_filtered.loc[data_filtered['location.city'] == 'University Village', 'location.city'] = 'Seattle'
data_filtered.loc[data_filtered['location.city'] == 'unknown', 'location.city'] = 'Seattle'

In [152]:
#Dropped all shops that weren't in Seattle
data_filtered = data_filtered[data_filtered['location.city'] == 'Seattle']

In [159]:
#Count number of words in reviews and add column
data_filtered['num_review_words'] = [len(x.split()) for x in data_filtered['combined_reviews']]

In [178]:
#Filter out places with no reviews. These appear to be closed, not actually coffeeshops, or incorrectly labeled places.
data_with_reviews = data_filtered[data_filtered['num_review_words'] > 0]

In [196]:
data_with_reviews.to_csv('data/seattle_only_shops.csv')

In [221]:
def fill_address_with_google_ids(google_id, api_key):
    '''
    Takes in a google place id and Google API key and outputs address pulled from Google's API.
    '''
    details_url = populate_google_details_url(google_id, api_key)
    details_resp = requests.get(url=details_url)
    details_data = json.loads(details_resp.text)
    try:
        address = details_data['result']['formatted_address'].split(',')[0]
        return address
    except:
        return None

##### Get addresses from Google

In [224]:
data_with_reviews['google_addresses'] = data_with_reviews.apply(lambda row: fill_address_with_google_ids(row['google_ids'],
                                                                                                        google_secrets['key']),
                                                               axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [229]:
data_with_reviews['final_address'] = data_with_reviews['location.address']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [234]:
data_with_reviews.loc[data_with_reviews['location.address'].isnull(),'final_address'] = data_with_reviews['google_addresses']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


##### Get addresses from Yelp

In [278]:
def fill_addresses_with_yelp_ids(yelp_id, api_key):
    '''
    Takes in Yelp id and Yelp API key and outputs location address.
    '''
    yelp_search_url = 'https://api.yelp.com/v3/businesses/{}'.format(yelp_id)
    search_resp = requests.get(url=yelp_search_url, 
                               headers={'Authorization':'Bearer {}'.format(api_key)})
    search_data = json.loads(search_resp.text)
    try:
        return search_data['location']['address1']
    except:
        return None

In [279]:
data_with_reviews['yelp_address'] = data_with_reviews.apply(lambda row: fill_addresses_with_yelp_ids(row['yelp_ids'], yelp_secrets['api_key']), 
                                                            axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [282]:
data_with_reviews.loc[data_with_reviews['final_address'].isnull(),'final_address'] = data_with_reviews['yelp_address']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


#### Drop shops with no addresses

In [286]:
data_with_addresses = data_with_reviews[~data_with_reviews['final_address'].isnull()].copy().

In [292]:
data_with_addresses.columns

Index(['Unnamed: 0', 'id', 'location.address', 'location.city', 'location.lat',
       'location.lng', 'location.postalCode', 'location.state', 'name',
       'stats.tipCount', 'stats.usersCount', 'tips', 'google_ids',
       'google_reviews', 'yelp_ids', 'yelp_reviews', 'tips_clean',
       'google_reviews_clean', 'yelp_reviews_clean', 'combined_reviews',
       'num_review_words', 'google_addresses', 'final_address',
       'yelp_address'],
      dtype='object')

In [297]:
#Remove unnecessary rows for simple tf-idf model
data_less_columns = data_with_addresses.drop(['Unnamed: 0', 'id', 
       'location.address', 'location.city', 'location.postalCode', 'location.state',
       'stats.tipCount', 'stats.usersCount', 'tips', 'google_ids',
       'google_reviews', 'yelp_ids', 'yelp_reviews', 'tips_clean',
       'google_reviews_clean', 'yelp_reviews_clean', 'google_addresses',
       'yelp_address'], axis=1).reset_index().drop('index', axis=1)

In [302]:
data_less_columns.to_csv('data/seattle_only_less_columns.csv')

In [303]:
data_less_columns

Unnamed: 0,location.lat,location.lng,name,combined_reviews,num_review_words,final_address
0,47.578838,-122.411199,Starbucks,absolutely incredible view in my opinion the b...,530,2742 Alki Ave SW
1,47.579130,-122.410511,Alki Cafe,if you cansit in hespers section she gives exc...,369,2726 Alki Ave SW
2,47.579352,-122.409126,Tully's Coffee,in addition to the great coffee theres a pleas...,536,2676 Alki Ave SW
3,47.580447,-122.406728,Ampersand Cafe,craft coffee drinks cider beer wine try the o...,339,2536 Alki Ave SW
4,47.680561,-122.404709,Jibe Espresso Bar,sandwiches and pastries made in house with a v...,362,7001 Seaview Ave NW #170
5,47.675598,-122.398264,The Scoop at Walter's,snoqualmie ice cream decent espresso and appar...,126,6408 32nd Ave NW
6,47.675674,-122.398387,Pico Café,various breads baked inhouse available daily p...,112,6415 32nd Ave NW
7,47.675592,-122.398240,Walter's,i love this place everybody is so nice in the...,196,6408 32nd Ave NW
8,47.659875,-122.398059,Discovery Espresso,roasted pasilla chile breakfast sammy is aweso...,164,3103 W Jameson St
9,47.639812,-122.399856,Starbucks,the power outlets are at the base of the windo...,262,3300 W. McGraw St.


In [321]:
data_not_starbucks = data_less_columns[data_less_columns['name'] != 'Starbucks'].reset_index().drop('index', axis=1)

In [322]:
data_not_starbucks.to_csv('data/data_not_starbucks.csv')

In [94]:
data_not_starbucks = pd.read_csv('data/data_not_starbucks.csv').drop('Unnamed: 0', axis=1)

#### Add google photoids to dataframe

In [22]:
cd website

/Users/ReddingSkinnyRobot/Galvanize/capstone_project/website


In [25]:
from google_api_functions import *

In [27]:
data_not_starbucks['google_image_content'] = data_not_starbucks.apply(lambda row: 
                                                                      get_google_photo(row['location.lat'],
                                                                                       row['location.lng'],
                                                                                       row['name'],
                                                                                       google_secrets['key']),
                                                                      axis=1)

In [39]:
no_image = data_not_starbucks['google_image_content'][4]

In [63]:
import base64

In [88]:
def get_base64(_bytes):
    try:
        return base64.b64encode(_bytes).decode()
    except:
        return None

In [89]:
data_not_starbucks['base64_google_images'] = data_not_starbucks.apply(lambda row: 
                                                                      get_base64(row['google_image_content']),
                                                                      axis=1)

In [93]:
data_not_starbucks.to_csv('data/data_not_starbucks.csv')

In [96]:
testunit = data_not_starbucks.iloc[1]

In [98]:
data_not_starbucks['google_image_id'] = data_not_starbucks.apply(lambda row:
                                                                 get_google_api_photo_id(row['location.lat'],
                                                                                         row['location.lng'],
                                                                                         row['name'],
                                                                                         google_secrets['key']),
                                                                 axis=1)

### Add custom shop ids

In [100]:
data_not_starbucks['shop_id'] = [x for x in range(data_not_starbucks.shape[0])]

In [162]:
output_data = data_not_starbucks.drop(['google_image_content', 'base64_google_images', 'google_image_id'], axis=1)

In [164]:
output_data.to_csv('data/data_not_starbucks.csv')

### Write photos to disk

In [181]:
from PIL import Image
import io

In [205]:
for index, row in data_not_starbucks[~data_not_starbucks['google_image_content'].isnull()].iterrows():
        try:
            image = Image.open(BytesIO(base64.decodebytes(row['base64_google_images'].encode())))
            with open('website/static/shop_images/{}.jpg'.format(row['shop_id']), 'wb') as f:  
                image.save(f)
        except OSError:
            continue

### Getting stats from databases

In [7]:
db = pd.read_csv('data/seattle_coffeeshops_foursquare_google_yelp.csv')

In [10]:
db.columns

Index(['Unnamed: 0', 'id', 'location.address', 'location.city', 'location.lat',
       'location.lng', 'location.postalCode', 'location.state', 'name',
       'stats.tipCount', 'stats.usersCount', 'tips', 'google_ids',
       'google_reviews', 'yelp_ids', 'yelp_reviews'],
      dtype='object')

In [35]:
num_reviews = db.apply(lambda row: (len(row['tips'].replace('"',"'").split("', ")) 
                                   + len(row['google_reviews'].replace('"',"'").split("', "))
                                   + len(row['yelp_reviews'].replace('"',"'").split("', "))),
                       axis=1)

In [37]:
num_reviews.sum() #Num reviews

13765

In [41]:
db_words = pd.read_csv('data/seattle_coffeeshops_combine_reviews.csv')

In [44]:
num_words = db_words.apply(lambda row: len(row['combined_reviews'].split(' ')),
                     axis=1)

In [46]:
num_words.sum() #Num Words

218786

In [49]:
db_words.shape[0] #Num Shops

870

In [54]:
db_seattle_only = pd.read_csv('data/seattle_only_shops.csv')

In [55]:
db_seattle_only.shape[0] #Num seattle only shops

617

In [50]:
db_notS = pd.read_csv('data/data_not_starbucks.csv')

In [53]:
db_notS.shape[0] #Num seattle only not starbucks

474