# 시카고 샌드위치 맛집 분석

### 1. 시카고 샌드위치 맛집 사이트에 접근하기

In [2]:
from bs4 import BeautifulSoup 
from urllib.request import urlopen

In [3]:
# Absolute URL of the ranking article, kept as base + path because url_base
# is reused later to resolve each entry's site-relative link.
url_base = 'http://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

# BeautifulSoup reads the whole response while constructing the tree, so the
# connection can be released as soon as parsing finishes; the context manager
# closes it even if parsing raises.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Every element carrying the CSS class "sammy" is one ranked sandwich entry;
# show the first one to inspect its structure (rank div + listing link).
rest_list = soup.select('.sammy')
rest_list[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br>
Old Oak Tap<br>
<em>Read more</em> </br></br></a></div>
</div>

In [6]:
# Number of entries matched by the '.sammy' selector.
len(rest_list)

50

### 2. 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [7]:
# The rank is the text of the entry's "sammyRank" div (still a string here).
rank_div = rest_list[0].select_one('.sammyRank')
rank = rank_div.string
rank

'1'

In [22]:
from urllib.parse import urljoin

# The link's href is site-relative, so resolve it against url_base to get
# an absolute URL for the detail page.
href = rest_list[10].find('a')['href']
url = urljoin(url_base, href)
url

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Lula-Cafe-Ham-and-Raclette-Panino/'

In [10]:
# Text of the first entry's link: menu name, cafe name and the "Read more"
# label, separated by line breaks.
first_link = rest_list[0].find('a')
tmp = first_link.get_text()
tmp

'BLT\r\nOld Oak Tap\nRead more '

In [11]:
# Splitting on '\n' separates menu / cafe / label; the menu keeps a trailing '\r'.
tmp.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [13]:
# Split once and pick the pieces: the first line is the menu name (minus its
# trailing carriage return), the second line is the cafe name.
parts = tmp.split('\n')
menu = parts[0].replace('\r', '')
cafe = parts[1]
menu, cafe

('BLT', 'Old Oak Tap')

In [26]:
# One accumulator per output column; position i in every list describes the
# same ranked entry.
rank_list, url_list, menu_list, cafe_list = [], [], [], []

for rest in rest_list:
    rank = int(rest.select_one('.sammyRank').string)

    # The entry's single link supplies both the detail-page href and the
    # "menu / cafe / Read more" text.
    anchor = rest.find('a')
    url = urljoin(url_base, anchor['href'])
    tmp = anchor.get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    rank_list.append(rank)
    url_list.append(url)
    menu_list.append(menu)
    cafe_list.append(cafe)

In [27]:
import pandas as pd

# Assemble the four parallel lists into one table; keyword order fixes the
# column order (Rank, Cafe, Menu, URL).
df = pd.DataFrame(
    data=dict(Rank=rank_list, Cafe=cafe_list, Menu=menu_list, URL=url_list)
)
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [28]:
# Persist the ranking table as comma-separated UTF-8. The DataFrame index is
# written too (to_csv's default), so it appears as an unnamed first column.
df.to_csv('chicago.csv', sep=',', encoding='utf8')

### 3. 다수의 웹 페이지에 자동으로 접근해서 원하는 정보 가져오기

In [29]:
# Fetch and parse the detail page of the first-ranked entry. The context
# manager closes the HTTP response once BeautifulSoup has consumed it, so
# the connection is not left open.
with urlopen(df['URL'][0]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [31]:
# The paragraph with class "addy" holds price, address, phone and site link.
soup_tmp.select_one('.addy')

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [33]:
# Plain text of that paragraph: the fields are separated by commas.
soup_tmp.select_one('.addy').get_text()

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [44]:
# Break the address paragraph into its comma-separated fields and count them.
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')
len(tmp)

3

In [45]:
# First field: the price and the street address together.
tmp[0]

'\n$6.85. 3351 N. Broadway'

In [52]:
# The price is the first whitespace-separated token of the first field;
# its last character (the period after the amount) is cut off.
price_token = tmp[0].split()[0]
price = price_token[:-1]
price

'$6.85'

In [51]:
# Everything after the price token in the first field is the street address.
words = tmp[0].split()
addr = ' '.join(words[1:])
addr

'3351 N. Broadway'

In [50]:
# Second field is the phone number here; strip() removes the surrounding
# whitespace left over from splitting on ','.
tmp[1].strip()

'773-868-4000'

In [40]:
from tqdm import tqdm_notebook

In [59]:
def _parse_addy(text):
    """Split an ``.addy`` paragraph's text into (price, address, phone, homepage).

    The text is a comma-separated line such as
    ``$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com``.
    The first field always holds the price token followed by the street
    address; the meaning of the remaining fields depends on how many there
    are. A single space is returned for any value that is absent.
    """
    fields = text.split(',')
    head = fields[0].split()
    price = head[0][:-1]  # cut the period that follows the amount
    addr = ' '.join(head[1:])
    # Every later field still carries the space that followed its comma.
    # Strip them all so addresses and homepages come out clean (previously
    # only the phone number was stripped).
    extras = [field.strip() for field in fields[1:]]

    if not extras:
        return price, addr, ' ', ' '
    if len(extras) == 1:
        # An address slot containing "Multiple" is stored without address or
        # phone, and the one extra field is taken as the homepage
        # (presumably a multi-location listing -- confirm against the site).
        if 'Multiple' in addr:
            return price, ' ', ' ', extras[0]
        return price, addr, extras[0], ' '
    # Two or more extras: the last two are phone and homepage, and anything
    # before them (e.g. a suburb name) belongs to the address. This also
    # covers lines with more than four fields, which previously matched no
    # branch and left the result lists with different lengths.
    *locality, tel, homepage = extras
    return price, ', '.join([addr, *locality]), tel, homepage


# One accumulator per output column; position i in every list describes the
# same ranked entry.
rank_list = []
url_list = []
menu_list = []
cafe_list = []
price_list = []
addr_list = []
tel_list = []
hp_list = []

for rest in tqdm_notebook(rest_list):
    # Summary data from the ranking page itself.
    rank = int(rest.select_one('.sammyRank').string)
    url = urljoin(url_base, rest.find('a')['href'])
    tmp = rest.find('a').get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    # Detail data from the entry's own page; close the response as soon as
    # it has been parsed.
    with urlopen(url) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')
    price, addr, tel, homepage = _parse_addy(
        soup_tmp.select_one('.addy').get_text()
    )

    # Append only once the whole row is available, so an exception while
    # fetching or parsing cannot leave the lists with unequal lengths.
    rank_list.append(rank)
    url_list.append(url)
    menu_list.append(menu)
    cafe_list.append(cafe)
    price_list.append(price)
    addr_list.append(addr)
    tel_list.append(tel)
    hp_list.append(homepage)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [60]:
# Pair each column label with its accumulator list; the label order below is
# the column order of the resulting table.
df2_labels = ['Rank', 'Cafe', 'Menu', 'Price', 'Address', 'Telephone', 'Home Page']
df2_values = [rank_list, cafe_list, menu_list, price_list, addr_list, tel_list, hp_list]
df2 = pd.DataFrame(dict(zip(df2_labels, df2_values)))
df2.head()

Unnamed: 0,Rank,Cafe,Menu,Price,Address,Telephone,Home Page
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,773-772-0406,theoldoaktap.com
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,312-929-4580,aucheval.tumblr.com
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,312-334-3688,rickbayless.com
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",847-475-9400,alsdeli.net
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,312-445-8977,publicanqualitymeats.com


In [61]:
# Persist the enriched table (price, address, phone, homepage) as
# comma-separated UTF-8; the index is written as well (to_csv's default).
df2.to_csv('chicago2.csv', sep=',', encoding='utf8')