# Data Acquisition

### Imports

In [68]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

### Web-Scraping Function

The functions below scrape Cazoo, the British online car retailer, and pull key vehicle characteristics for each listing, such as:
* Car Brand
* Car Model
* Price
* Mileage
* Registration Date
* Trim Specs
* Engine Specs
* Car Body Type

#### Extracting Function

In [69]:
def extract(page):
    '''
    Download one multi-listing search page from Cazoo and return it as a parsed BeautifulSoup object.

    The 'page' input is the search results page number to request, which allows the caller to loop through all multi-listing pages.
    The returned soup is what the individual listing url links are later pulled from.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
    url = f'https://www.cazoo.co.uk/cars/?page={page}'
    # headers must be passed by keyword: the second positional argument of requests.get is 'params',
    # so passing the dict positionally puts the User-Agent into the query string instead of sending it as a header.
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.content, 'html.parser')

    return soup

#### Transforming Function

In [70]:
def _text_or_blank(node):
    '''Return the text of a BeautifulSoup lookup result, or '' when the lookup found nothing (None).'''
    return '' if node is None else node.text


def _spec_value(car_soup, test_id):
    '''Return the <dd> value inside the <div> whose 'data-test-id' equals test_id, or '' when either element is missing.'''
    wrapper = car_soup.find('div', attrs={'data-test-id': test_id})
    if wrapper is None:
        return ''
    return _text_or_blank(wrapper.find('dd', class_='list-label-valuestyles__StyledAttributeValue-opn4m9-3 aJytw'))


def _listing_price(car_soup):
    '''Return the listing price text, trying each known price class in order, or '' when none is present.'''
    price_classes = (
        'Pricingstyles__TotalPriceValue-sc-1m171co-4 drnOgY',
        'Pricingstyles__TotalPriceValue-sc-1m171co-4 atSNp',
    )
    for price_class in price_classes:
        node = car_soup.find('p', class_=price_class)
        if node is not None:
            return node.text
    return ''


def transform(soup):
    '''
    This function pulls the individual listing urls from the multi-listing search page soup returned by the 'extract' function.
    Each url is requested and parsed, and the key characteristic elements are pulled from the individual listing page.
    Any characteristic that cannot be found on the page is stored as an empty string.
    The characteristics for each listing are appended to the module-level 'car_list' as one dictionary per listing.

    Listing cards without a usable link are skipped. The function returns None; its result is the side effect on 'car_list'.
    '''
    # Loop-invariant, so built once rather than once per listing.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
    car_links = soup.find_all('div', class_='vehicle-cardstyles__InfoWrap-sc-1bxv5iu-2 feeGku')
    for car_link in car_links:
        anchor = car_link.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # Without an href there is no listing page to request; skip the card instead of aborting the whole scrape.
            continue

        car_url = f'https://www.cazoo.co.uk{link}'
        # headers must be passed by keyword: the second positional argument of requests.get is 'params',
        # so passing the dict positionally would append the User-Agent to the query string instead of sending it as a header.
        car_requests = requests.get(car_url, headers=headers)
        car_soup = BeautifulSoup(car_requests.content, 'html.parser')

        ## Create Dictionary
        car = {
            'model': _text_or_blank(car_soup.find('h1', class_='sc-10u69o8-0 kpcZmo')),
            'listing_price': _listing_price(car_soup),
            'trim': _text_or_blank(car_soup.find('p', class_='sc-yrk414-0 jBtyVi')),
            'reg_date': _spec_value(car_soup, 'registrationDate'),
            'mileage': _spec_value(car_soup, 'odometerReading'),
            'transmission': _spec_value(car_soup, 'gearbox'),
            'color': _spec_value(car_soup, 'exteriorColour'),
            'car_body': _spec_value(car_soup, 'bodyType'),
            'car_engine': _spec_value(car_soup, 'engineSize'),
            'car_page_url': car_url
        }
        car_list.append(car)

    return
    
# Shared accumulator: transform() appends one dictionary per scraped listing to this list.
car_list = []

#### Selecting the Final Listing Page Number

In [71]:
# Probe successive search pages (starting at page 2) until the "no results" heading appears.
# final_page_number therefore ends up as the first page number that shows that heading.
# NOTE: the loop only terminates once that exact heading text is found on a page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
final_page_number = 1
final_page_element = ''
while final_page_element != 'Sorry, this search has no results':
    final_page_number += 1
    url = f'https://www.cazoo.co.uk/cars/?page={final_page_number}'
    # headers must be passed by keyword: the second positional argument of requests.get is 'params',
    # so passing the dict positionally would append the User-Agent to the query string instead of sending it as a header.
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.content, 'html.parser')
    heading = soup.find('h2', class_ = 'sc-yrk414-0 ikGAuA')
    if heading is not None:
        # Heading not found leaves final_page_element unchanged, so the loop keeps probing.
        final_page_element = heading.text

print(final_page_number)

154


#### Launch Function and Save Results to a CSV file

In [72]:
# Walk the search pages from 1 up to (but not including) final_page_number.
# Each iteration waits 30 seconds, fetches the page and feeds it to transform(),
# which appends the scraped listings to car_list.
for page_number in range(1, final_page_number):
    time.sleep(30)
    transform(extract(page_number))

# Turn the accumulated listing dictionaries into a table and write it to disk.
df = pd.DataFrame(car_list)
df.to_csv('../datasets/cazoo_raw_data.csv')

7147
