# Data Scraping 

This is an exercise in data scraping. I have chosen a webpage that provides information on the 'best restaurants in NYC'. 

In [1]:
# we will be using these imports
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import lxml.html as lh
import csv 

### Create the Data Pipeline

In [5]:
import os

class restaurant_data_pipeline(object):
    """Write the restaurants collected by a scraper to ``./data/restaurant_data.csv``.

    Expected call order: ``open_spider`` -> ``process_item`` -> ``close_spider``.
    """

    # Column order of the CSV; these are the keys of each restaurant dict.
    _FIELDNAMES = ['Type', 'Name', 'Price', 'Link']

    def open_spider(self, spider):
        """Create ``./data`` if needed and open the output CSV for writing.

        Args:
            spider: Unused here; accepted so all pipeline hooks share a signature.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('./data', exist_ok=True)
        # newline='' is required by the csv module (otherwise rows get an extra
        # '\r' on Windows); utf-8 keeps non-ASCII restaurant names portable.
        self.file = open('./data/restaurant_data.csv', 'w',
                         newline='', encoding='utf-8')
        self._header_written = False

    def close_spider(self, spider):
        """Close the output CSV opened by ``open_spider``.

        Args:
            spider: Unused here; accepted so all pipeline hooks share a signature.
        """
        self.file.close()

    def process_item(self, spider):
        """Write every restaurant dict in ``spider.data`` as one CSV row.

        The header row is written only on the first call after ``open_spider``
        so repeated calls do not duplicate it. A ``spider.data`` of ``None``
        (nothing scraped yet) is treated as an empty list.

        Args:
            spider: Object whose ``data`` attribute is a list of dicts keyed by
                'Type', 'Name', 'Price' and 'Link', or ``None``.
        """
        w = csv.DictWriter(self.file, self._FIELDNAMES)
        if not getattr(self, '_header_written', False):
            w.writeheader()
            self._header_written = True
        w.writerows(spider.data or [])
        

In [6]:
import logging

class restaurant_scraper():
    """Scrape restaurant entries from the gallery pages listed in ``start_urls``.

    After ``parse()`` the results are available in ``self.data`` as a list of
    dicts with the keys 'Type', 'Name', 'Price' and 'Link'.
    """

    def __init__(self):
        self.name = "restaurants"
        self.start_urls = [
            "https://www.cntraveler.com/gallery/best-restaurants-in-new-york-city",
        ]
        # None until parse() has run; afterwards a list of restaurant dicts.
        self.data = None

    @staticmethod
    def _text_or_none(tag):
        """Return the text of a BeautifulSoup tag, or None if the tag is missing."""
        return tag.get_text() if tag is not None else None

    def _parse_row(self, row):
        """Build one restaurant dict from an 'image-content-container' element."""
        # The price lives in <div class="ratings"> -> <p class="price">; either
        # level may be absent, in which case no price is listed.
        rating = row.find('div', attrs={'class': 'ratings'})
        price = None
        if rating is not None:
            price = rating.find('p', attrs={'class': 'price'})
        link = row.a
        return {
            'Type': self._text_or_none(row.p),
            'Name': self._text_or_none(row.h2),
            # Store the text, not Tag.contents (a list that would be written
            # to the CSV as "['$$$']").
            'Price': self._text_or_none(price),
            'Link': link.get('href') if link is not None else None,
        }

    def parse(self):
        """Fetch every URL in ``start_urls`` and collect its restaurants.

        Results from all URLs are accumulated and stored in ``self.data``
        (an empty list if nothing was found).

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-success HTTP status.
        """
        restaurants = []
        for url in self.start_urls:
            # A timeout keeps an unresponsive server from hanging the run.
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, "lxml")
            table = soup.find('div', attrs={'class': 'gallery-items-container'})
            if table is None:
                # Page layout differs from what we expect; skip instead of
                # failing with an opaque AttributeError.
                logging.warning("no gallery-items-container found at %s", url)
                continue
            for row in table.find_all('div', attrs={'class': 'image-content-container'}):
                restaurants.append(self._parse_row(row))

        self.data = restaurants


# perform the scraping

In [7]:
# make the spider object and scrape the page(s) into r.data
r = restaurant_scraper()
r.parse()
# make the pipeline and write the scraped rows to ./data/restaurant_data.csv
pl = restaurant_data_pipeline()
pl.open_spider(r)
try:
    pl.process_item(r)
finally:
    # close the CSV file even if writing a row fails
    pl.close_spider(r)
