In [493]:
import csv
import requests
import os
import numpy as np
import pandas as pd 
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pickle
from time import sleep 
from random import randint 
import random 
import json 
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

### Data Obtaining Part 1: Defining States and Capitals

Specifically choose 3 states from each region of the USA. Priority on BBQ states and population count 
Division 1: Massachusetts, Connecticut, Rhode Island
Division 2: New Jersey, New York, Pennsylvania 
Division 3: Illinois, Ohio, Indiana
Division 4: Kansas, Missouri, Nebraska
Division 5: North Carolina, South Carolina, Florida
Division 6: Alabama, Tennessee, Kentucky
Division 7: Texas, Louisiana (50), Oklahoma (27) 
Division 8: Nevada (28), Arizona (5), Colorado (19)
Division 9: California, Hawaii, Washington

https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States
https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population
https://en.wikipedia.org/wiki/Barbecue_in_the_United_States#Kansas_City
https://en.wikipedia.org/wiki/File:Census_Regions_and_Division_of_the_United_States.svg

In [863]:
state_capital = pd.read_csv("us-state-capitals.csv")
state_capital.rename(columns={"description":"capital"}, inplace=True)
state_capital.head()

state_abbrev = pd.read_csv("state_abbrev.csv")

In [864]:
state_abbrev.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [865]:
### Create State Capital Panda DF. Contains State/Capital/Division 
state_capital_df = pd.DataFrame(columns=['State', 'Capital', 'Division'] )

### List of states per division 
div_1 = ["Massachusetts", "Connecticut", "Rhode Island"]
div_2 = ["New Jersey", "New York", "Pennsylvania"]
div_3 = ["Illinois", "Ohio", "Indiana"]
div_4 = ["Kansas", "Missouri", "Nebraska"]
div_5 = ["North Carolina", "South Carolina", "Florida"]
div_6 = ["Alabama", "Tennessee", "Kentucky"]
div_7 = ["Texas", "Louisiana", "Oklahoma"]
div_8 = ["Nevada", "Arizona", "Colorado"]
div_9 = ["California", "Hawaii", "Washington"]

combined_div = [ div_1, div_2, div_3, div_4, div_5, div_6, div_7, div_8, div_9]

### Creating each row 
for div in enumerate(combined_div) : 
    for state in div[1] : 
        result = state_capital.loc[ state_capital["name"] == state ]
        
        if result.empty : 
            print("Failed")
            break 
        
        
        state_capital_df = state_capital_df.append({'State': state_abbrev.loc[state_abbrev["State"] == state].Code.values[0],
                                                    'Capital': result.capital.values[0],
                                                    'Division': div[0] + 1}, ignore_index=True)
    if result.empty : 
        break 

print(state_capital_df.shape)
state_capital_df.head()

(27, 3)


Unnamed: 0,State,Capital,Division
0,MA,Boston,1
1,CT,Hartford,1
2,RI,Providence,1
3,NJ,Trenton,2
4,NY,Albany,2


In [866]:
state_capital_df.to_pickle("state_div.pkl")

### Data Obtaining Part 2: Create GET Request on Foursquare to Obtain Restaurants

In [160]:
### Create Main Restaurant Panda DF 
### Table Entries: ID, Name, Lat/Long, City/State, Category Name, Rating, Number of Rating
### Menu, Pricing,  

restaurant_df = pd.DataFrame(columns=['ID', 'Name', 'Latitude', "Longitude", "State", "City", "Division", "Category Name", "Rating", "Number of Rating"] )

In [120]:
### Foursquare Client ID/Secret and Version
CLIENT_ID = 'TTBNZVFHMNPROOW2YUFMMDCLG1RWVHJURIQEJG5JH2ZG0510' 
CLIENT_SECRET = '4FVF4FFBMZUPRFXCAHLUVYFQFNDOVLR3HNUXIGZX1ECR3KVA'
VERSION = '20191230'

### Category ID matches venues having BBQ Restaurant as their category. 
categoryId= "4bf58dd8d48988d1df931735"
LIMIT = "300"
radius = 50000 

### Function to Retrieve Data and Create Data Frame 
def restaurant_retrieve (states, cities, divisions, rest_info, id_track ) : 
    
    for state, city, div in zip(states, cities, divisions) : 
        ### Create URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&categoryId={}&near={},{}&limit={}&radius={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION,
            categoryId, 
            city, 
            state,  
            LIMIT, 
            radius)

        ### Make GET Request
        results = requests.get(url).json()
        
        ### Append to DF
        for v in results["response"]["venues"]:
            
            if v["id"] not in id_track and v["location"]["state"] == state : 
                rest_info.append([v["id"], v["name"], v["location"]["lat"], v["location"]["lng"], 
                      v["location"]["state"], v["location"]["city"], 1, v["categories"][0]["name"], 
                      0, 0])
                id_track.append(v["id"])

            
        
    return rest_info, id_track 

### Function to Retrieve information on Restaurants 
def restaurant_info_retrieve (ID) : 
    
    url = 'https://api.foursquare.com/v2/venues/{}/menu?&client_id={}&client_secret={}&v={}'.format(
            ID, 
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION
            )
    
    results = requests.get(url).json() 
    
    items = []
    price = []
    
    print(results["response"])
    
    if results["response"] == {} : 
        pass 
    
    elif results["response"]["menu"]["menus"]["count"] == 0 : 
        pass 
    else : 
        for i in results["response"]["menu"]["menus"]["items"][0]["entries"]["items"] : 
            for item_price in i["entries"]["items"] :
                
                if "prices" not in item_price.keys() : 
                    items.append(item_price["name"])
                    price.append("Unknown")   
                    
                elif item_price["prices"] != [] : 
                    items.append(item_price["name"])
                    price.append(item_price["prices"][0]) 
                
    return items, price, url 
    

In [None]:
### Create empty list variables 
rest_info = []
id_list = []

In [None]:
### Method 1: Obtaining Restaurant Menu and Price via FourSquare (Unable to get more than 50 due to Premium calls)
restaurant_menu_df = pd.DataFrame(columns=['Restaurant', 'Menu_Item', 'Price', 'Link'] )

for i in range(restaurant_df.size) : 
    ID = restaurant_df.iloc[i].ID
    Name = restaurant_df.iloc[i].Name
    
    if ID not in id_list : 
        print(i)
        items, price, url = restaurant_info_retrieve(ID) 
        rest_info.append([Name, items, price, url,])
        id_list.append(ID)


In [395]:
### Saves the current restaurant info and IDs 
pickle.dump( id_list, open( "id_list.p", "wb" ) )
pickle.dump( rest_info, open( "rest_info.p", "wb" ) )

In [392]:
### Loads the current restaurant info and IDs 
id_list = pickle.load(open("id_list.p", "rb"))
rest_info = pickle.load(open("rest_info.p", "rb"))

Table Entries
ID, Name, Lat/Long, City/State, Category Name, 
Menu, Pricing, Rating, Number of Rating 

**Menu and Pricing will probably need to be in a separate table as well 
**Rating will show how well the restaurant is performing 
**Number of rating will show how many people actively rated this restaurant

In [214]:
### Call Function 
rest_info = []
id_track = [] 

rest_info, id_track = restaurant_retrieve (state_capital_df["State"], state_capital_df["Capital"], 
                                           state_capital_df["Division"], rest_info, id_track )


In [215]:
### Convert to DF 
restaurant_df = pd.DataFrame(rest_info)
restaurant_df.columns = ['ID', 'Name', 'Latitude', "Longitude", 
                         'State', "City", "Division", "Category Name", 
                         "Rating", "Number of Rating"]

In [233]:
### Pickle Save Files 
restaurant_df.to_pickle("restaurant_df_pickle.pkl")
pickle.dump( id_track, open( "id_track.p", "wb" ) )

In [459]:
### Using BS4 to obtain remaining information from website 
from itertools import cycle


### Header Detail
headers = { 'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# link = "https://foursquare.com/v/terry-blacks-bbq/533efc4d498e3775c831b2ee/menu"
# proxies = get_proxies()
# proxy_pool = cycle(proxies)

restaurant_menu_df = pd.DataFrame(columns=['Restaurant', 'Menu_Item', 'Price', 'Link', "Ratings", "Ratings_Number"] )

count = 0 

for i in range(46) : 
    ID = restaurant_df.iloc[i].ID
    Name = restaurant_df.iloc[i].Name 
    link = "https://foursquare.com/v/{}/{}/menu".format( Name.replace(" ", "-").lower(), ID )
    print("Restaurant Number:", i)
    
    with requests.Session() as s :
        sleep(randint(1,7))
        headers['user-agent'] = random.choice(user_agent_list) 
        
        r = s.get(link, headers=headers )
        soup = BeautifulSoup(r.content, 'html.parser')
        
        pot_error = soup.find_all( id = "container")
        error = pot_error[0].text.replace("\n", "")

        if "We couldn't find the page you're looking for" in error :
            restaurant_menu_df = restaurant_menu_df.append({'Restaurant': Name ,
                                                            'Menu_Item' : "Empty",
                                                            'Price' : "Empty",
                                                            'Link': link,
                                                            'Ratings': "Empty",
                                                            'Ratings_Number': "Empty"}, ignore_index=True)
                                                            
        else : 
            rate_value_search = soup.find_all( itemprop = "ratingValue" )
            rate_num_search = soup.find_all( itemprop = "ratingCount")
            menu_item_price = soup.find_all(class_ = ["menuHeader", "entryPrice"] )
            
            menu_item = [i.text for i in menu_item_price[0:len(menu_item_price):2] ]
            price = [i.text for i in menu_item_price[1:len(menu_item_price):2] ] 
            
            if menu_item[0] == "Please Contact the Restaurant Directly" : 
                menu_item = [""]
                price = [""]        
                
            if rate_value_search == [] or rate_num_search == [] : 
                rate_value = -1 
                rate_num = -1 
                
            else : 
                rate_value = rate_value_search[0].text
                rate_num = rate_num_search[0].text 
                
            restaurant_menu_df = restaurant_menu_df.append({'Restaurant': Name ,
                                                    'Menu_Item' : menu_item,
                                                    'Price' : price, 
                                                    'Link': link,
                                                    'Ratings': rate_value,
                                                    'Ratings_Number': rate_num}, ignore_index=True)
        
    
    
    

Restaurant Number: 0


IndexError: list index out of range

In [446]:
### Loading in relevant information through accessing each page file text. 
### Information pertains to pricing scale, rating, and number of rating

restaurant_pricing = [] 
restaurant_rating = [] 
restaurant_rating_num = [] 

for i in range(restaurant_df.shape[0]) : 
    ### Load File
    test = pickle.load(open('Restaurant_Info_Text_Pickle/Text_Dump_%d.p' %(i+1) , 'rb') ) 
    
    ### Find Price Rating 
    first_index = test.find("price")
    last_index = test.find("}", first_index)

    first_test_str = test[first_index:last_index+1]
    first_index = first_test_str.find("message") 
    last_index = first_test_str.find("currency", first_index)
    
    price_tier = first_test_str[first_index+11:last_index-3] 
    
    if price_tier == "" : 
        price_tier = "Not Available"
    
    ### Find Rating and Number of Rating  
    first_index = test.find("rating")
    last_index = test.find("ratingColor", first_index)
    rating_number = test[first_index+9:last_index-2]

    first_index = test.find("ratingSignals")
    last_index = test.find("delivery", first_index)
    num_sample = test[first_index+16:first_index+20]
    
    if first_index == -1 : 
        rating_number = -1 
        num_sample = 0
    
    else : 
        num_sample = num_sample.split()[0]
    
    restaurant_pricing.append(price_tier)
    restaurant_rating.append( float(rating_number) ) 
    restaurant_rating_num.append( int(num_sample) ) 
    

In [448]:
### Load in Data into DF 
restaurant_df["Price Tier"] = restaurant_pricing 
restaurant_df["Rating"] = restaurant_rating 
restaurant_df["Number of Rating"] = restaurant_rating_num

In [483]:
### Store premimum calls into temporary arrays 
restaurant_menu = [ i[1] for i in rest_info ]
restaurant_menu_price = [ i[2] for i in rest_info ]

In [840]:
### Function Call for Parsing though Menu text 

def menu_item_parse( rest_text) : 
    
    ### Find all names/prices occurence in the text
    name_pos = [m.start() for m in re.finditer('\"name\"', rest_text)]
    price_pos = [m.start() for m in re.finditer('\"prices\"', rest_text)]

    price_counter = 0 
    
    menu_item = [] 
    menu_item_price = [] 

    for i in name_pos : 
        ### Determine if the given name tag string has a price tag below it 
        new_line_pose = [m.start() for m in re.finditer('\n', rest_text[i:]) ] 

        result = [ 1 for j in range(0,2) if '\"prices\"' in rest_text[i+new_line_pose[j]:i+new_line_pose[j]+9].strip() ]
        
        if result != [] : 
            ### Obtain Prices specifically by searching start/end brackets and convert to dictionary 
            prices_start = price_pos[price_counter]
            price_dict_start = rest_text[prices_start:].find("[") + prices_start 
            price_dict_end = rest_text[price_dict_start:].find("]") + price_dict_start
            price_dict = rest_text[price_dict_start: price_dict_end+1]
            price_dict = json.loads(price_dict.replace("[", "{").replace("]", "}") ) 
            
            end_quote = rest_text[i+8:].find('\"')
            
            menu_item.append(rest_text[i+8:i+8+end_quote])
            menu_item_price.append(price_dict)
            price_counter += 1   
    
    return menu_item, menu_item_price
    

In [841]:
### Compiling all restaurant items and prices 
full_menu_item = []
full_menu_price = [] 
testing = [] 

for i in range(46,restaurant_df.shape[0]) : 
    ### Load File
    print(i)
    test = pickle.load(open('Restaurant_Text_Pickle/Text_Dump_%d.p' %(i) , 'rb') ) 
    menu_item, menu_item_price = menu_item_parse(test)
    
    testing.append(test)
    full_menu_item.append(menu_item)
    full_menu_price.append(menu_item_price) 

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
30

In [849]:
final_menu = restaurant_menu + full_menu_item
final_menu_price = restaurant_menu_price + full_menu_price 

In [854]:
restaurant_df["Menu Item"] = final_menu 
restaurant_df["Menu Item Price"] = final_menu_price 

In [855]:
restaurant_df.head()

Unnamed: 0,ID,Name,Latitude,Longitude,State,City,Division,Category Name,Rating,Number of Rating,Price Tier,Menu Item,Menu Item Price
0,534f07e9498e5cc70137182b,The Causeway Restaurant and Pub,42.364659,-71.062912,MA,Boston,1,BBQ Joint,7.0,55,Moderate,[],[]
1,593d9fb5c876c8327eef128d,Rusty Can,42.755437,-70.938839,MA,Byfield,1,BBQ Joint,8.0,14,Moderate,[],[]
2,4ba2b7caf964a520211338e3,Joff's Backyard Grill,42.084574,-71.471883,MA,Bellingham,1,BBQ Joint,7.9,18,Moderate,[],[]
3,5c5efed8419a9e002ce8ea9c,The Smoke Shop BBQ - Assembly Row,42.392249,-71.07818,MA,Somerville,1,Restaurant,-1.0,0,Moderate,"[17Th Street Soul Rolls, Hot Links & Pimento C...","[9.00, 7.50, 9.00, 10.00, 12.00, 9.00, 9.00, 8..."
4,5ca25d74dd12f8002c74364b,Flip The Bird,42.55927,-70.88162,MA,Beverly,1,BBQ Joint,-1.0,0,Moderate,[],[]


In [858]:
### Final Dump and Save DF 
restaurant_df.to_pickle("restaurant_df_pickle.pkl")

In [860]:
### Load Table up 
restaurant_df = pd.read_pickle("restaurant_df_pickle.pkl")
restaurant_df.head()

Unnamed: 0,ID,Name,Latitude,Longitude,State,City,Division,Category Name,Rating,Number of Rating,Price Tier,Menu Item,Menu Item Price
0,534f07e9498e5cc70137182b,The Causeway Restaurant and Pub,42.364659,-71.062912,MA,Boston,1,BBQ Joint,7.0,55,Moderate,[],[]
1,593d9fb5c876c8327eef128d,Rusty Can,42.755437,-70.938839,MA,Byfield,1,BBQ Joint,8.0,14,Moderate,[],[]
2,4ba2b7caf964a520211338e3,Joff's Backyard Grill,42.084574,-71.471883,MA,Bellingham,1,BBQ Joint,7.9,18,Moderate,[],[]
3,5c5efed8419a9e002ce8ea9c,The Smoke Shop BBQ - Assembly Row,42.392249,-71.07818,MA,Somerville,1,Restaurant,-1.0,0,Moderate,"[17Th Street Soul Rolls, Hot Links & Pimento C...","[9.00, 7.50, 9.00, 10.00, 12.00, 9.00, 9.00, 8..."
4,5ca25d74dd12f8002c74364b,Flip The Bird,42.55927,-70.88162,MA,Beverly,1,BBQ Joint,-1.0,0,Moderate,[],[]
