# Web Scraper
## What is a web scraper?
A web scraper is a tool that scrapes websites for information/data.
Used in conjunction with a web crawler, a program can crawl the web and extract data from webpages and store it.

This is the same tooling that Google and other search engines utilize to be able to allow users to search for any web page.

## How can we utilize it?
We know 2 things:
1. Each manufacturer has their own website with listings for all their current vehicles
2. Each vehicle page on the manufacturers site has all the specs and features listed as well as additional information

With this knowledge, we can
- Deterministically parse out vehicle specs from manufacturers websites
- Reduce possibility for error by removing manual input
- Maintain an up to date vehicle listing for new vehicles from any manufacturer.
- Reduce the reliance on chrome data for new vehicles

In [1]:
import requests, html5lib, json, time, copy
import urllib.request
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
class Debugger:
    def __init__(self):
        self.errors = []
        self.mismatchElements = []
        self.timestamp = int(time.time())
        
    def printElements(self, elements):
        try:
            [self.printElement(e) for e in elements]
        except  AttributeError:
            print("Dont worry, not a driver object. Remove this line")

    def printElement(self, element):
        try:
            print(element.text)
        except  AttributeError:
            print("Dont worry, not a driver object. Remove this line")

    def addErrors(self, error):
        self.errors.append(error)
    
    def logWeirdElement(self, element, url, model, make, origin):
        self.mismatchElements.append({
                "innerHTML": element.get_attribute('innerHTML'), 
                "outerHTML": element.get_attribute('outerHTML'),
                "element": element.text,
                "url": url,
                "make": make,
                "model": model,
                "origin": origin
            })
    
    def encounteredErrors(self):
        found = False
        if len(self.mismatchElements) > 0:
            print("Check Logs for Mismatched Elements")
            found = True
        if len(self.errors) > 0:
            print("Check Logs for Errors")
            found = True
        return found
    
    def printMismatchElements(self):
        for element in self.mismatchElements:
            print(element)
            
    def printErrors(self):
        for error in self.errors:
            print(error)
    
    def __saveToFile(self, data, filename):
        try:
            with open(filename, 'w') as outfile:
                json.dump(data, outfile)
        except Exception as e:
            print(f"Error Saving to File: {e}")
            self.addErrors({"error": repr(e), "origin": "__saveToFile()"})
    
    def __saveErrors(self):
        if len(self.errors) > 0:
            self.__saveToFile(self.errors, f"Logs/{self.timestamp}_errors.json")
    
    def __saveMismatchElements(self):
        if len(self.mismatchElements) > 0:
            self.__saveToFile(self.mismatchElements, f"Logs/{self.timestamp}_mismatchElements.json")
    
    def saveToLogs(self):
        self.__saveErrors()
        self.__saveMismatchElements()

In [3]:
class FileWriter:
    def __init__(self, debugger):
        self.filename = f'Results/{int(time.time())}_vehicleSpecs.json'
        self.debugger = debugger
        self.initialized = False
        
    def __initializeResultsFile(self):
        with open(self.filename, mode='w') as resultsFile:
            json.dump([], resultsFile)

    def appendToFile(self, vehicleSpecs):
        if not self.initialized:
            self.__initializeResultsFile()
            self.initialized = True
        try:
            currentData = []
            with open(self.filename, 'r') as resultsFile:
                currentData = json.load(resultsFile)
                currentData.append(vehicleSpecs)
            with open(self.filename, 'w') as resultsFile:
                json.dump(currentData, resultsFile, indent=4, sort_keys=True)
        except Exception as e:
            print(f"Couldnt save vehicle data to file: {e}")
            self.debugger.addErrors({"error": repr(e), "origin": "appendToFile"})
    
    def writeToDemoFile(self, specs):
        try:
            with open('Results/demo_vehicleSpecs.json', 'w') as outfile:
                json.dump(specs, outfile, indent=4, sort_keys=True)
        except Exception as e:
            self.debugger.addErrors(repr(e))

In [4]:
class FrontPageListingParser():
    def __init__(self, url, debugger=None, collectNewImages=False):
        self.vehicles = []
        self.url = url
        self.parsedModels = {}
        if debugger is not None:
            self.debugger = debugger
        self.__setupDriver()
        self.__clickVehicleDropDown()
        self.__setVehicleCategoryButtons()
        self.collectNewImages = collectNewImages
        
    def __setupDriver(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(5)
        self.driver.get(self.url)
        sleep(2)
        
    def __clickVehicleDropDown(self):
        python_button = self.driver.find_element_by_css_selector("button[data-view='select-vehicle']")
        if python_button.text == "Select Vehicle":
            python_button.click()
        else:
            raise ValueError(f"ERROR: Bad Button access: {python_button.text}")
        sleep(2)
        
    def __setVehicleCategoryButtons(self):
        vehicleCategories = self.driver.find_element_by_css_selector("ul[class='models']")
        self.vehicleCategoryButtons = vehicleCategories.find_elements_by_tag_name("li")
        sleep(3)
    
    def __validateVehicle(self, vehicleData):
        #Note: we can have it update the category here
        if vehicleData['model'] not in self.parsedModels.keys():
            self.parsedModels.update({vehicleData['model']: vehicleData})
            self.vehicles.append(vehicleData)
    
    def __borrowImage(self, li, model):
        filename = f"./Images/{model}.png"
        if self.collectNewImages:
            imageDiv = li.find_element_by_css_selector("div[class='vehicle-image-wrapper']")
            imageURL = imageDiv.find_element_by_css_selector("img").get_attribute("data-srcset")
            urllib.request.urlretrieve(self.url + imageURL, filename)
        return filename
    
    def getVehicleFrontPageListing(self):
        print("Getting Vehicle Listing...")
        for button in self.vehicleCategoryButtons:
            category = button.text
            button.click()
            parentList = self.driver.find_elements_by_css_selector("ul[class='vehicles']")
            for vehicleGroup in parentList:
                listItems = vehicleGroup.find_elements_by_css_selector("li")
                for li in listItems:
                    link = li.find_element_by_css_selector("a").get_attribute("href")
                    details = li.find_element_by_css_selector("p[class='model']").text
                    if link is not None and details != '':
                        vehicleData = {}
                        vehicleData["link"] = link
                        vehicleData["category"] = category
                        vehicleData["year"], vehicleData["model"] =  details.split(' ', 1)
                        vehicleData["image"] = self.__borrowImage(li, vehicleData["model"])
                        self.__validateVehicle(vehicleData)
        sleep(3)
        self.driver.quit()
        print(f"Retrieved {len(self.vehicles)} vehicles from the homepage {self.url}")
        return self.vehicles

In [10]:
# for each spec group parse the important info
## Vehicle Spec Parser Class here
class VehicleSpecParser:
    def __init__(self, make, model, year, url, debugger):
        self.make = make.lower()
        self.model = model.lower()
        self.year = year
        self.url = url
        self.debugger = debugger
        self.__setupDriver()
        self.__clickSpecsButton()
        self.__getVehicleFeatureCategories()
        self.__createRowTemplate()
        self.__createSeriesLists()
    
    def cleanup(self):
        self.driver.quit()
         
    def __setupDriver(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(5)
        self.driver.get(self.url)
        sleep(2)
    
    def __createRowTemplate(self):
        self.__rowTemplate = {}
        for s in self.series:
            self.__rowTemplate[s] = {}
    
    def __createSeriesLists(self):
        self.__seriesLists = {}
        for s in self.series:
            self.__seriesLists[s] = []
            
    def __clickSpecsButton(self):
        specsButton = self.driver.find_element_by_xpath("//*[text()='Specs']")
        specsButton.click()
        sleep(3)
        self.series = [s.text for s in self.driver.find_elements_by_css_selector("span[class='series']") if s.text != '']
        self.trims = [t.text for t in self.driver.find_elements_by_css_selector("div[class='trim']") if t.text != '']
        self.trimAndSeries = {}
        for i, s in enumerate(self.series):
            self.trimAndSeries[s] = self.trims[i]
        sleep(3)
        
    def __ignoreEmptys(self, x):
        if x.text != '':
            return True
        else:
            return False
        
    def __getVehicleFeatureCategories(self):
        collapseButton = self.driver.find_element_by_css_selector("button[data-di-id='#collapse-btn']")
        collapseButton.click()
        featuresElement = self.driver.find_element_by_css_selector("div[class='feature-accordions']")
        self.features = [f.text for f in filter(
            self.__ignoreEmptys, featuresElement.find_elements_by_css_selector("div[class='tcom-accordion']"))]
        sleep(3)

    def __findAndClickMore(self, row):
        # class="view-more"
        try:
            moreButton = row.find_element_by_css_selector("button[class='view-more']")
            if moreButton:
                morebutton.click()
        except Exception as e:
            self.debugger.addErrors(e)

    def __getFeatureElement(self, feature):
        try:
            if 'Warranty Information' in feature:
                featureElement = self.driver.find_element_by_css_selector(
                    f"div[class='tcom-accordion'][title='Warranty Information*']")
            else:
                featureElement = self.driver.find_element_by_css_selector(
                    f"div[class='tcom-accordion'][title='{feature}']")
            return featureElement
        except Exception as e:
            self.debugger.addErrors({"error": repr(e), "make": self.make, "model": self.model,  "url": self.url, "origin": "__getFeatureElement"})
            return self.driver

    def __revealFeatureElement(self, featureElement):
        try:
            featureButton = featureElement.find_element_by_css_selector("button")
            featureButton.click()
        except Exception as e:
            self.debugger.addErrors({"error": repr(e), "make": self.make, "model": self.model,  "url": self.url, "origin": "__getFeatureElement"})
        
    
    def __getValue(self, valueDriver):
        if valueDriver.text != '':
            return valueDriver.text
        return valueDriver.find_element_by_css_selector("i").get_attribute('aria-label')


    
    def __expandDescription(self, container):
        desc = container.find_element_by_css_selector("td[class='category-title']")
        self.__findAndClickMore(desc)
        return desc.text
    
    def __getGroupingDescription(self, featureGroup):
        try:
            groupingDescription = featureGroup.find_element_by_css_selector("td[class='category-title sub-header']").text
            if groupingDescription == "":
                return None
            return groupingDescription
        except:
            return None
    
    def __getDescription(self, categoryRow):
        return self.__expandDescription(categoryRow)
        

    def __getRowData(self, featureRow):
        rowData = {}
        values = featureRow.find_elements_by_css_selector("td[class='category-value']")
        for i, s in enumerate(self.series):
            rowData[s] = self.__getValue(values[i])
        return rowData
        
    def __getRows(self, featureRows, featureGroup):
        groupingDescription = self.__getGroupingDescription(featureGroup)
        seriesLists = copy.deepcopy(self.__seriesLists)
        
        for featureRow in featureRows:
            description = self.__getDescription(featureRow)
            data = self.__getRowData(featureRow)
            
            for series in data:
                seriesData = { "description": description, "subgroup": groupingDescription,"value": data[series] }
                seriesLists[series].append(seriesData)
        return seriesLists
        

    def __getFeatureData(self, featureElement):
        featureData = copy.deepcopy(self.__seriesLists)
        
        # a featureGroup is a table of related data within a feature
        # there can be multiple featureGroups, generally marked with a groupingDescription
        featureGroups = featureElement.find_elements_by_css_selector("table[class='feature-group']")
        for featureGroup in featureGroups:
            if featureGroup.text == '':
                self.debugger.logWeirdElement(featureGroup, self.url, self.model, self.make, "__getFeatureData: empty text")
                continue
            try:
                # featureRows is a row that has a title, and a column of data for each series
                featureRows = featureGroup.find_elements_by_css_selector("tr[class='category-container']")
                data = self.__getRows(featureRows, featureGroup)
                for series in data:
                    featureData[series].extend(data[series])
                 
                
            except Exception as e:
                self.debugger.logWeirdElement(featureGroup, self.url, self.model, self.make, "__getFeatureData: bad element access")
                self.debugger.addErrors(
                    {"error": repr(e), "make": self.make, "model": self.model,  "url": self.url, "origin": "__getFeatureData: bad element access"})
                continue
        return featureData
    
    def parseSpecs(self):
        '''
        Given the web page for a specific vehicle,
        Parse out all vehicle specs from the web page
        Divided out by: Feature > Category (row of data in a feature) > spec
        '''
        self.specs = {
            "Trims": self.trims, 
            "Series": self.series,
            "Model": self.model,
            "Make": self.make,
            "Year": self.year,
            "URL": self.url,
            "TrimAndSeries": self.trimAndSeries, 
            "FeaturesList": self.features,
            "Features": {}
        }
        startTime = time.time()
        print(f"{self.model}:\tParsing Vehicle Features")
        for feature in self.features:
            print(f"\t{self.model}:\tParsing Feature: {feature}")
            # Find the element containing the specific feature 
            featureElement = self.__getFeatureElement(feature)
            self.__revealFeatureElement(featureElement)
            self.specs["Features"][feature] = self.__getFeatureData(featureElement)
            # print(self.specs["Features"][feature])
        print(f"{self.model}:\t Parsing Completed in {(time.time() - startTime)/60} minutes")
        return self.specs


# Single Vehicle Parse Demo
- Note: occasionally the page doesnt load in time and it fails (i think?)
- Parses a singular vehicle (toyota yaris)

### Expected output:
```
yaris:	Parsing Vehicle Features
	yaris:	Parsing Feature: MPG/Other/Price
	yaris:	Parsing Feature: Exterior
	yaris:	Parsing Feature: Interior
	yaris:	Parsing Feature: Multimedia
	yaris:	Parsing Feature: Safety/Convenience
	yaris:	Parsing Feature: Mechanical/Performance
	yaris:	Parsing Feature: Dimensions
	yaris:	Parsing Feature: Weights/Capacities
	yaris:	Parsing Feature: Tires
	yaris:	Parsing Feature: Warranty Information* 61
yaris:	 Parsing Completed in 92.34145927429199 seconds
```

In [11]:
def singleVehicleParseDemo():
    demoDebugger = Debugger()
    fileWriter = FileWriter(demoDebugger)
    # Create the vehicle parser object
    vehicleSpecParser = VehicleSpecParser("Toyota", "Yaris", "2020", "https://www.toyota.com/yaris", demoDebugger)
    specs = [vehicleSpecParser.parseSpecs()]
    
    if demoDebugger.encounteredErrors():
        # demoDebugger.printErrors()
        demoDebugger.saveToLogs()
    fileWriter.writeToDemoFile(specs)
    vehicleSpecParser.cleanup()
    del demoDebugger
    del vehicleSpecParser
    print('\nCheck "Results/demo_vehicleSpecs.json" for results')

singleVehicleParseDemo()

yaris:	Parsing Vehicle Features
	yaris:	Parsing Feature: MPG/Other/Price
	yaris:	Parsing Feature: Exterior
	yaris:	Parsing Feature: Interior
	yaris:	Parsing Feature: Multimedia
	yaris:	Parsing Feature: Safety/Convenience
	yaris:	Parsing Feature: Mechanical/Performance
	yaris:	Parsing Feature: Dimensions
	yaris:	Parsing Feature: Weights/Capacities
	yaris:	Parsing Feature: Tires
	yaris:	Parsing Feature: Warranty Information* 78
yaris:	 Parsing Completed in 1.4174623449643453 minutes
Check Logs for Mismatched Elements
Check Logs for Errors
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not defined
name 'morebutton' is not def

---
# Front Page Vehicle Listing Demo:
Parses the vehicle listing from the home page ofr toyota.com


In [7]:
def frontPageVehicleListingDemo():
    demoDebugger = Debugger()
    # Parse the front page for a vehicleListing
    frontPageParser = FrontPageListingParser("https://www.toyota.com", demoDebugger)
    vehicles = frontPageParser.getVehicleFrontPageListing()
    return vehicles
    del demoDebugger
    del frontPageParser

# Here you can playground with the class
v = frontPageVehicleListingDemo()

---

# Integration for all Vehicles:

This will go through every retrieved vehicle from the front page and attempt to parse the data

We call on the vehicle spec parser object with the following params:
- Make
- Model
- Model Year
- Link to the model page (retrieved from the data in the front page parsed data)

In [122]:
# Parse the front page for a vehicleListing
def main():
    debug = Debugger()
    frontPageParser = FrontPageListingParser(url="https://www.toyota.com", collectNewImages=False)
    vehicles = frontPageParser.getVehicleFrontPageListing()

    startTime = time.time()

    fileWriter = FileWriter(debug)
    
    for v in vehicles:
        print(f'{v["model"]} - {v["year"]} - {v["link"]}')
        vehicleSpecs = {}
        try:
            vehicleSpecParser = VehicleSpecParser("Toyota", v["model"], v["year"], v["link"], debug)
            vehicleSpecs = vehicleSpecParser.parseSpecs()
            vehicleSpecs.update({"image": v["image"], "category": v["category"]})
            vehicleSpecParser.cleanup()
        except Exception as e:
            print(f'Model: {v["model"]} Error Occurred {e}')
            debug.addErrors({"error": repr(e), "make": "Toyota", "model": v["model"],  "url": v["link"], "origin": "main()"})
            vehicleSpecParser.cleanup()
        fileWriter.appendToFile(vehicleSpecs)
        sleep(5)

    print(f"\n\nProcessing Completed in {(time.time() - startTime)/60} minutes")   
    debug.saveToLogs()

In [123]:
main()

Getting Vehicle Listing...
Retrieved 30 vehicles from the homepage https://www.toyota.com
Yaris - 2020 - https://www.toyota.com/yaris
yaris:	Parsing Vehicle Features
	yaris:	Parsing Feature: MPG/Other/Price
	yaris:	Parsing Feature: Exterior
	yaris:	Parsing Feature: Interior
	yaris:	Parsing Feature: Multimedia
	yaris:	Parsing Feature: Safety/Convenience
	yaris:	Parsing Feature: Mechanical/Performance
	yaris:	Parsing Feature: Dimensions
	yaris:	Parsing Feature: Weights/Capacities
	yaris:	Parsing Feature: Tires
	yaris:	Parsing Feature: Warranty Information* 63
yaris:	 Parsing Completed in 1.1155525684356689 minutes
Yaris Hatchback - 2020 - https://www.toyota.com/yarishatchback
yaris hatchback:	Parsing Vehicle Features
	yaris hatchback:	Parsing Feature: MPG/Other/Price
	yaris hatchback:	Parsing Feature: Exterior
	yaris hatchback:	Parsing Feature: Interior
	yaris hatchback:	Parsing Feature: Multimedia
	yaris hatchback:	Parsing Feature: Safety/Convenience
	yaris hatchback:	Parsing Feature: M

	sienna:	Parsing Feature: Tires
	sienna:	Parsing Feature: Packages
	sienna:	Parsing Feature: Warranty Information* 88
sienna:	 Parsing Completed in 2.0648432970046997 minutes
Tacoma - 2020 - https://www.toyota.com/tacoma
tacoma:	Parsing Vehicle Features
	tacoma:	Parsing Feature: MPG/Other/Price
	tacoma:	Parsing Feature: Exterior
	tacoma:	Parsing Feature: Interior
	tacoma:	Parsing Feature: Multimedia
	tacoma:	Parsing Feature: Safety/Convenience
	tacoma:	Parsing Feature: Mechanical/Performance
	tacoma:	Parsing Feature: Dimensions
	tacoma:	Parsing Feature: Weights/Capacities
	tacoma:	Parsing Feature: Towing
	tacoma:	Parsing Feature: Tires
	tacoma:	Parsing Feature: Options/Packages
	tacoma:	Parsing Feature: Warranty Information* 71
tacoma:	 Parsing Completed in 1.4854854583740233 minutes
Tundra - 2020 - https://www.toyota.com/tundra
tundra:	Parsing Vehicle Features
	tundra:	Parsing Feature: MPG/Other/
Price
	tundra:	Parsing Feature: Exterior
Model: Tundra Error Occurred Message: Element <b

# Future Applications
- Generate a keyword list so the data access can be a bit more dynamic
- Use headless chrome so a window doesnt pop up for each vehicle
- Using a single driver rather than one for each vehicle page (unless we have this run in parallel)
- Ability to parse all manufacturers websites
- Fresh manufacturer data daily
- Rather than have a data steward manually parse the data themselves, they can actually validate the data being pulled by the scraper.
    - this reduces the possibility for error
    - frees up time for the data stewards to get more of the important work done
- logging the errors that occur help us analyze and improve the scraping process
