In [10]:
from bs4 import BeautifulSoup
import urllib2
import re
import math
import pandas as pd
import numpy as np
import MySQLdb
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

def _fetch_listing_soup(url):
    # Download `url` with a browser-like User-agent and return the parsed page.
    request = urllib2.Request(url)
    request.add_header('User-agent', 'Mozilla/5.0 (Linux i686)')

    response = urllib2.urlopen(request)
    try:
        return BeautifulSoup(response.read())
    finally:
        # Release the connection even if reading or parsing raises.
        response.close()


def _extract_js_field(page_text, field):
    # Collect every value assigned as  data[<n>]['<field>'] = "<value>";  in the page text.
    # The capture is non-greedy so that several assignments sharing one line
    # are still returned as separate values.
    pattern = r"data\[[0-9]+\]\['" + re.escape(field) + r"'\] = " + r'"(.*?)";'
    return re.findall(pattern, page_text)


def get_restaurant_info(html_page, base_url):
    """Scrape the paginated restaurant listing into a pandas DataFrame.

    Parameters
    ----------
    html_page : str
        URL prefix of the listing; the 1-based page number is appended to it
        to form the address of each page.
    base_url : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per restaurant with the columns 'name', 'link', 'price',
        'address', 'longitude' and 'latitude'.

    Notes
    -----
    The total number of restaurants is read from the first <strong> element
    of page 1, and the listing is walked at 100 restaurants per page.
    The zero-based index of each processed page is printed as progress.
    """

    # Page 1 supplies the restaurant count and is reused as the first listing
    # page below, so it is only downloaded once.
    first_soup = _fetch_listing_soup(html_page + str(1))

    num_restaurants = int(first_soup.find('strong').text)
    num_pages = int(math.ceil(num_restaurants / 100.0))

    total_names = []
    total_links = []
    total_prices = []
    total_addresses = []
    total_longitude = []
    total_latitude = []

    for num_page in range(num_pages):

        if num_page == 0:
            page_soup = first_soup
        else:
            page_soup = _fetch_listing_soup(html_page + str(num_page + 1))

        # Restaurant anchors carry the class 'link'; the name is whatever text
        # is left once the leading <span> and the closing </a> are removed.
        places = page_soup.find_all(class_ = 'link')

        for the_place in places:
            total_names.append(re.sub('.*</span>|</a>', '', str(the_place)))
            total_links.append(the_place.get('href'))

        raw_prices = page_soup.findAll(True, {'class':['price1', 'price2', 'price3', 'price4', 'price5']})

        for the_price in raw_prices:
            total_prices.append(the_price.text)

        # Coordinates and street addresses are read from the page's inline
        # JavaScript; serialise the page once and reuse it for all three fields.
        page_text = str(page_soup)

        longitude = [float(value) for value in _extract_js_field(page_text, 'longitude')]
        latitude = [float(value) for value in _extract_js_field(page_text, 'latitude')]
        address = _extract_js_field(page_text, 'address1')

        # Take exactly one coordinate/address entry per listed place so these
        # columns stay the same length as the names and links.
        for index in range(len(places)):
            total_longitude.append(longitude[index])
            total_latitude.append(latitude[index])
            total_addresses.append(address[index])

        print(num_page)

    df = pd.DataFrame({ 'name' : total_names,
                        'link' : total_links,
                        'price': total_prices,
                        'address' : total_addresses,
                        'longitude' : total_longitude,
                        'latitude' : total_latitude})

    return df

In [11]:
# Site root; passed to the scraper here and prepended to each restaurant link
# when the individual menu pages are requested further down.
base_url = 'http://boston.menupages.com'

# Walk every page of the all-areas / all-neighborhoods / all-cuisines listing.
restaurant_data = get_restaurant_info(
    "http://boston.menupages.com/restaurants/all-areas/all-neighborhoods/all-cuisines/",
    base_url,
)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [12]:
# Show the scraped restaurant table as plain text.
print(restaurant_data)

                     address   latitude  \
0           182 Brighton Ave  42.353124   
1                56 Beach St  42.351492   
2      757 Massachusetts Ave  42.366397   
3          1369 Cambridge St  42.373697   
4             1309 Beacon St  42.342122   
5            66 Harrison Ave  42.351303   
6             150 Bowdoin St  42.358537   
7              29 Newbury St  42.352369   
8                  89 2nd St  42.368428   
9               40 Dalton St  42.345997   
10              49 Temple Pl  42.355018   
11            419 Harvard St  42.345496   
12        105 Huntington Ave  42.346431   
13            75 Chestnut St  42.356970   
14          220 Northern Ave  42.349137   
15                 50 JFK St  42.372096   
16           174 Harvard Ave  42.351183   
17          433 Cambridge St  42.354754   
18          187 Hampshire St  42.372858   
19        2350 Washington St  42.329436   
20        445 Somerville Ave  42.382193   
21           Location Varies  42.357969   
22         

In [18]:
def get_menu_tags(html_menu, base_url):
    """Fetch one restaurant's menu page and pull out its raw cuisine tags.

    Parameters
    ----------
    html_menu : str
        Restaurant link as scraped from the listing; 'menu' is appended to it.
    base_url : str
        Site root that is prepended to `html_menu`.

    Returns
    -------
    list
        A two-element list [tags, features]:
        tags is every "setTargeting('cuisine'..." match, each running to the
        end of its line and still unparsed (to be cleaned in R);
        features holds one 'Gluten Free Items' string per occurrence of that
        phrase in the page, so it is empty when the phrase is absent.
    """
    # Tag function added after, incorporate with menu_to_words to make it quicker

    menu_request = urllib2.Request(base_url + html_menu + 'menu')
    menu_request.add_header('User-agent', 'Mozilla/5.0 (Linux i686)')

    menu_response = urllib2.urlopen(menu_request)
    try:
        menu_soup = BeautifulSoup(menu_response.read())
    finally:
        # Release the connection even if reading or parsing raises.
        menu_response.close()

    # Serialise the parsed page once and run both searches over the same text.
    menu_text = str(menu_soup)
    tags = re.findall(r"setTargeting\('cuisine'.*", menu_text)
    features = re.findall("Gluten Free Items", menu_text)

    return [tags, features]

In [19]:
# Upper bound on the number of menu pages fetched in one run; every restaurant
# costs one HTTP request.
MAX_MENUS = 200

# Stop at the end of the table when fewer than MAX_MENUS restaurants were
# scraped, instead of indexing past the last row.
num_menus = min(MAX_MENUS, len(restaurant_data['link']))

tags_and_features = []

for i in range(num_menus):
    tags_and_features.append(get_menu_tags(restaurant_data['link'][i], base_url))
    print(i)




0
1
2
3
4


In [20]:
# Inspect the raw [tags, features] pair collected for each restaurant.
print(tags_and_features)

[[["setTargeting('cuisine', ['chinese', 'japanese', 'sushi']);\r"], []], [["setTargeting('cuisine', ['bakeries', 'desserts']);\r"], []], [["setTargeting('cuisine', ['cafes-coffeehouses']);\r"], []], [["setTargeting('cuisine', ['cafes-coffeehouses']);\r"], []], [["setTargeting('cuisine', ['desserts', 'kosher']);\r"], []]]


In [22]:
# Keep only the first element (the cuisine tag matches) of every pair.
restaurant_tags = [tags for tags, _features in tags_and_features]
print(restaurant_tags)

[["setTargeting('cuisine', ['chinese', 'japanese', 'sushi']);\r"], ["setTargeting('cuisine', ['bakeries', 'desserts']);\r"], ["setTargeting('cuisine', ['cafes-coffeehouses']);\r"], ["setTargeting('cuisine', ['cafes-coffeehouses']);\r"], ["setTargeting('cuisine', ['desserts', 'kosher']);\r"]]


In [24]:
# Keep only the second element (the 'Gluten Free Items' matches) of every pair.
restaurant_features = [features for _tags, features in tags_and_features]
print(restaurant_features)

[[], [], [], [], []]


In [25]:
# 1 when the restaurant's match list contains 'Gluten Free Items', otherwise 0.
clean_features = [int('Gluten Free Items' in x) for x in restaurant_features]

In [26]:
# Show the 0/1 gluten-free indicator for each restaurant.
print(clean_features)

[0, 0, 0, 0, 0]
