# Webscraping Project - Deepak Khirey
## Notebook 1 - Webscraping using Beautifulsoup
This notebook contains code to scrape Subarban Ford website "https://www.suburbanford.com/new-inventory/index.htm". This code is executed daily once to get car inventory data from website.

In [1]:
from urllib.parse import urlparse
from pprint import pprint
from urllib.parse import urlparse, parse_qs, parse_qsl
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
import pandas as pd
from datetime import date
import os
import time

In [2]:
def getpagedata(url):
    # Requesting data from the website
    try:
        response = urlopen(url)
        page_content = response.read()
        bs = BeautifulSoup(page_content, 'lxml')
        return bs
        #print(bs.prettify())
    except Exception as e:
        print(e)

In [3]:
def scrapeallowed(url):
    # URL to Python's robots.txt files
    python_robot_url = url+ r'/robots.txt'
    #r'https://www.autotrader.com/robots.txt'

    # User-Agent
    user_agent = r'*' # '*' refers to all other bots

    # Create a RobotFileParser object
    # Pass Python's URL while creating the object
    python_robot_parser = RobotFileParser(python_robot_url)
    python_robot_parser.read()

    print('The robots file read is at:', python_robot_parser.url)

    valid_request = python_robot_parser.can_fetch(user_agent, url)
    print('The requested URL', url, 'can be crawled:', valid_request)
    return valid_request

In [4]:
def getclassvalue(tag,classname):
    value = None if tag.find(class_=classname) is None else tag.find(class_=classname).getText()
    return value

In [5]:
def getdetails(bs):
    header_class_elems = bs.find_all(class_='hproduct auto ford') 
    #pprint(header_class_elems)  
    for element in header_class_elems:
        data = {}
        name = element.find('h3').getText()
        pricing = element.find(class_='pricing multiple-prices list-unstyled')
        #msrp = None if pricing.find(class_='msrp') is None else pricing.find(class_='msrp').getText()
        #abSub = None if pricing.find(class_='abSub') is None else pricing.find(class_='abSub').getText()
        #internetPrice = None if pricing.find(class_='internetPrice') is None else pricing.find(class_='internetPrice').getText()
        #finalPrice = None if pricing.find(class_='stackedFinal final-price') is None else pricing.find(class_='stackedFinal final-price').getText()
        #print(name,msrp,abSub,internetPrice,finalPrice)
        data['scrapeDate']=date.today().strftime("%d/%m/%Y")
        data['name'] = name
        data['msrp'] = getclassvalue(pricing,'msrp')
        data['abSub'] = getclassvalue(pricing,'abSub')
        data['internetPrice'] = getclassvalue(pricing,'internetPrice')
        data['finalPrice'] = getclassvalue(pricing,'finalPrice')
        description = None if element.find(class_='description') is None else element.find(class_='description')
        #description = element.find(class_='description').find_all(['dt','dd'])
        #print(description)
        for dt_el, dd_el in zip(*(iter(description.find_all(['dt','dd'])),) * 2):
            data[dt_el.text] = dd_el.text
            #print(dt_el.text,"......",dd_el.text)
        #print(data)
        inventory.append(data)
    #return inventory

In [6]:
def writedata(inventory,filename):
    inventoryDF = pd.DataFrame(inventory)
    inventoryDF.sort_values('VIN: ',inplace=True)
    path = os.getcwd()+"/data/"+filename
    inventoryDF.to_csv(path,index=False)
    print('data written in '+path)

In [7]:
pages = [str(i) for i in range(0,1000,16)]
inventory = []
for page in pages:
    url = r'https://www.suburbanford.com/new-inventory/index.htm?start='+page+'&'
    print(url)
    if scrapeallowed(url):
        bs = getpagedata(url)
        getdetails(bs)
        filename = 'suburbanford_inventory_'+date.today().strftime("%d%m%Y")+'.csv'
        writedata(inventory,filename)
    else:
        print('scraping not allowed for this url...'+url)
    time.sleep(10)

https://www.suburbanford.com/new-inventory/index.htm?start=0&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=0&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=0& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=16&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=16&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=16& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=32&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=32&/robots.txt
The re

data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=320&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=320&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=320& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=336&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=336&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=336& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.s

data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=640&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=640&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=640& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=656&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=656&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=656& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.s

data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=960&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=960&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=960& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.suburbanford.com/new-inventory/index.htm?start=976&
The robots file read is at: https://www.suburbanford.com/new-inventory/index.htm?start=976&/robots.txt
The requested URL https://www.suburbanford.com/new-inventory/index.htm?start=976& can be crawled: True
data written in C:\Users\t7659dk\Documents\Deepak\mydata\IUMSDS\Semester 7 - Fall 2019\Web Scraping\Project/data/suburbanford_inventory_03112019.csv
https://www.s