# 시카고 샌드위치 맛집 분석

## 1. 시카고 샌드위치 맛집 사이트에 접근하기

In [1]:
from bs4 import BeautifulSoup 
from urllib.request import urlopen

In [2]:
# Build the absolute URL of the Chicago Magazine best-sandwiches ranking page.
url_base = 'http://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

# Fetch and parse the page. BeautifulSoup reads the response while it is
# being constructed, so the response can be closed right afterwards; the
# context manager does that even if parsing raises.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
rest_list = soup.find_all(class_='sammy')
rest_list[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br>
Old Oak Tap<br>
<em>Read more</em> </br></br></a></div>
</div>

In [4]:
len(rest_list)

50

## 2. 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [5]:
tmp_one = rest_list[0]

In [6]:
tmp_one.find(class_="sammyRank")

<div class="sammyRank">1</div>

In [7]:
tmp_one.find(class_="sammyRank").get_text()

'1'

In [8]:
tmp_one.find(class_="sammyListing").get_text()

'BLT\r\nOld Oak Tap\nRead more '

In [9]:
tmp_one.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [10]:
tmp_str = tmp_one.find(class_="sammyListing").get_text()
tmp_str.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [11]:
tmp_res = tmp_str.split('\n')
menu = tmp_res[0].replace('\r', '')
menu

'BLT'

In [12]:
cafe = tmp_res[1]
cafe

'Old Oak Tap'

In [13]:
# Regular Expression
import re

In [14]:
re.split(('\n|\r\n'), tmp_str)

['BLT', 'Old Oak Tap', 'Read more ']

In [15]:
print(re.split(('\n|\r\n'), tmp_str)[0])
print(re.split(('\n|\r\n'), tmp_str)[1])

BLT
Old Oak Tap


In [16]:
from urllib.parse import urljoin

In [17]:
## Collect the per-column values before building the DataFrame
rank = []
main_menu = []
cafe_name = []
url_add = []

# Compiled once outside the loop; matches both "\r\n" and "\n" line breaks,
# exactly like the pattern used in the exploratory cells above.
line_break = re.compile(r'\r\n|\n')

for rest in rest_list:
    rank.append(int(rest.find(class_="sammyRank").get_text()))

    # The listing text holds the menu on its first line and the cafe name
    # on its second line; split it a single time and reuse the pieces.
    tmp_str = rest.find(class_="sammyListing").get_text()
    fields = line_break.split(tmp_str)
    main_menu.append(fields[0])
    cafe_name.append(fields[1])

    # urljoin resolves a site-relative href against the site root.
    url_add.append(urljoin(url_base, rest.find('a')['href']))

In [18]:
rank[:5]

[1, 2, 3, 4, 5]

In [19]:
main_menu[:5]

['BLT', 'Fried Bologna', 'Woodland Mushroom', 'Roast Beef', 'PB&L']

In [20]:
cafe_name[:5]

['Old Oak Tap', 'Au Cheval', 'Xoco', 'Al’s Deli', 'Publican Quality Meats']

In [21]:
url_add

['http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Als-Deli-Roast-Beef/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Publican-Quality-Meats-PB-L/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Hendrickx-Belgian-Bread-Crafter-Belgian-Chicken-Curry-Salad/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Acadia-Lobster-Roll/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Birchwood-Kitchen-Smoked-Salmon-Salad/',
 'http://www.chicagomag.com/Chicago-Magazine/November-2012/

In [23]:
# Assemble the scraped lists into a DataFrame (one row per ranked sandwich)
import pandas as pd

df = pd.DataFrame(
    {
        'rank': rank,
        'cafe': cafe_name,
        'menu': main_menu,
        'URL': url_add,
    }
)
df.head()

Unnamed: 0,rank,cafe,menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [24]:
df.to_csv('chicago1.csv', sep=',', encoding="UTF-8")

## 3. 다수의 웹 페이지에 자동으로 접근해서 원하는 정보 가져오기

In [26]:
df['URL'][0]

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [28]:
# Fetch and parse the first detail page; the context manager closes the
# HTTP response once BeautifulSoup has read it.
with urlopen(df['URL'][0]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [29]:
soup_tmp.find(class_='addy')

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [30]:
tmp_str = soup_tmp.find(class_='addy').get_text()
tmp_str

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [31]:
tmp_str.split()

['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']

In [33]:
tmp_price = tmp_str.split()[0][:-1]
tmp_price

'$10'

In [34]:
tmp_str.split()[1:-2]

['2109', 'W.', 'Chicago', 'Ave.,']

In [35]:
' '.join(tmp_str.split()[1:-2])

'2109 W. Chicago Ave.,'

In [36]:
tmp_addr = ' '.join(tmp_str.split()[1:-2])[:-1]
tmp_addr

'2109 W. Chicago Ave.'

### - 상태 진행바 적용

In [37]:
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`.
# Import the replacement under the old name so every cell that calls
# tqdm_notebook(...) keeps working without the deprecation warning.
from tqdm.notebook import tqdm as tqdm_notebook
import time

In [38]:
# Visit every detail page and extract the price and the street address from
# its "addy" paragraph, whose text looks like
# "$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com".
price = []
addr = []
for n in tqdm_notebook(df.index):
    # Close each response as soon as it has been parsed so the loop does
    # not leave one open HTTP response behind per row.
    with urlopen(df['URL'][n]) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')
    tmp_str = soup_tmp.find(class_='addy').get_text()

    # Tokenize once. The first token is the price followed by "."; the
    # last two tokens are dropped (phone and website in the sample above);
    # the remaining tokens form the address, which ends with ",".
    tokens = tmp_str.split()
    price.append(tokens[0][:-1])
    addr.append(' '.join(tokens[1:-2])[:-1])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [39]:
price[:10]

['$10', '$9', '$9.50', '$9.40', '$10', '$7.25', '$16', '$10', '$9', '$17']

In [40]:
addr[:5]

['2109 W. Chicago Ave.',
 '800 W. Randolph St.',
 '445 N. Clark St.',
 '914 Noyes St., Evanston',
 '825 W. Fulton Mkt.']

In [41]:
df.head()

Unnamed: 0,rank,cafe,menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [42]:
# The detail-page URL is no longer needed once price and address have been
# scraped from it: drop that column in place and attach the new ones.
df.drop(columns=['URL'], inplace=True)
df['price'], df['address'] = price, addr
df.head()

Unnamed: 0,rank,cafe,menu,price,address
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston"
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.


In [43]:
df.set_index('rank', inplace=True)
df.head()

Unnamed: 0_level_0,cafe,menu,price,address
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston"
5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.


In [44]:
df.to_csv('chicago2.csv', sep=',', encoding="UTF-8")

## 4. 맛집 위치를 지도에 표기하기

In [66]:
import numpy as np
import folium
import googlemaps

In [70]:
# Read the Google Maps API key from a local text file.
# The context manager closes the file even if the read fails, and strip()
# removes surrounding whitespace (such as a trailing newline) that would
# otherwise become part of the key.
with open('googlemapskey.txt', mode='r') as key_fd:
    gmaps_key = key_fd.read(100).strip()

In [71]:
gmaps = googlemaps.Client(key=gmaps_key)

In [72]:
# Geocode every address with the Google Maps client. Rows that cannot be
# geocoded get NaN so that lat/lng stay aligned with the rows of df.
lat = []
lng = []

for n in tqdm_notebook(df.index):
    gmaps_output = []
    if df['address'][n] != 'Multiple':
        # Append the city name to the scraped address before querying.
        target_name = df['address'][n]+', '+'Chicago'
        gmaps_output = gmaps.geocode(target_name)

    if gmaps_output:
        # Use the first result returned by the geocoder.
        location_output = gmaps_output[0].get('geometry')
        lat.append(location_output['location']['lat'])
        lng.append(location_output['location']['lng'])
    else:
        # Either the address is the 'Multiple' placeholder or the geocoder
        # returned no match: record a gap instead of failing on
        # gmaps_output[0].
        lat.append(np.nan)
        lng.append(np.nan)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [73]:
df['lat'] = lat
df['lng'] = lng
df.head()

Unnamed: 0_level_0,cafe,menu,price,address,lat,lng
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,41.895605,-87.679961
2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,41.884658,-87.647667
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,41.890523,-87.630783
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",42.058322,-87.683748
5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,41.8866,-87.648451


In [74]:
# Centre the map on the mean latitude/longitude of the geocoded rows and
# drop a single marker on that centre point.
center = [df['lat'].mean(), df['lng'].mean()]

mapping = folium.Map(location=center, zoom_start=11)
center_marker = folium.Marker(center, popup='center')
center_marker.add_to(mapping)
mapping

In [77]:
# Map centred on the mean coordinates, with one marker per restaurant.
mapping = folium.Map(location=[df['lat'].mean(), df['lng'].mean()],
                     zoom_start=11)

# Plot only rows that have a usable position: skip the 'Multiple'
# placeholder addresses and any row whose latitude or longitude is NaN,
# since a marker cannot be placed at a missing coordinate.
plottable = df[(df['address'] != 'Multiple')
               & df['lat'].notna()
               & df['lng'].notna()]

for cafe_label, lat_val, lng_val in zip(plottable['cafe'],
                                        plottable['lat'],
                                        plottable['lng']):
    folium.Marker([lat_val, lng_val], popup=cafe_label).add_to(mapping)

mapping

In [78]:
df.to_csv('chicago3.csv', sep=',', encoding="UTF-8")