In [None]:
!pip install requests-cache

In [2]:
import requests
import numpy as np
import pandas as pd
import requests_cache
import lxml.html as html_parser
import re

In [3]:
# Highest listing-index page number requested by urls_scraping (pages 1..MAX_PAGE).
# NOTE(review): the run cell calls these the "allowed 333 pages" - presumably the
# site's pagination cap; confirm before changing.
MAX_PAGE = 333

In [4]:
## Get all the urls for all the listed used vehicles on truecar.com
def urls_scraping(base_url = 'https://www.truecar.com/used-cars-for-sale/listings/', max_page = None, timeout = 30):
    """Collect the detail-page URLs of the used-vehicle listings.

    Requests ``base_url + '?page=<i>'`` for ``i = 1 .. max_page`` and gathers
    the ``href`` of every ``div[@data-test="usedListing"]/a`` element, prefixed
    with ``https://www.truecar.com``.

    Parameters
    ----------
    base_url : str
        URL of the paginated listing index.
    max_page : int or None
        Last page number to request. ``None`` (default) uses the module-level
        ``MAX_PAGE`` constant, which is the original behaviour.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the crawl indefinitely.

    Returns
    -------
    list of str
        Listing URLs in page order. Crawling stops at the first page that
        cannot be fetched, so the list may be partial; a warning is emitted
        when that happens.
    """
    import warnings

    if max_page is None:
        max_page = MAX_PAGE

    urls = []
    for page_number in range(1, max_page + 1):
        page = base_url + '?page=' + str(page_number)
        try:
            response = requests.get(page, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            # Keep the best-effort "stop at the first failing page" behaviour,
            # but make the truncation visible instead of swallowing it (and do
            # not trap KeyboardInterrupt the way a bare `except:` does).
            warnings.warn('Stopping URL collection at {}: {}'.format(page, error))
            break
        root = html_parser.fromstring(response.content)
        urls += ['https://www.truecar.com' + link
                 for link in root.xpath('//div[@data-test="usedListing"]/a/@href')]

    return urls

In [20]:
def _first_text(root, xpath):
    """Return the stripped first result of ``xpath`` on ``root``, or None if nothing matches."""
    matches = root.xpath(xpath)
    return matches[0].strip() if matches else None


def _spec_value(root, heading):
    """Return the text of the <p> following the data-qa="Heading" div that contains ``heading``."""
    return _first_text(
        root,
        '//div[@data-qa="Heading" and contains(., "' + heading + '")]/following-sibling::p/text()')


def _history_value(root, label):
    """Return the text of the div preceding the parent of the "_1crvurj" <p> that contains ``label``."""
    # NOTE(review): "_1crvurj" looks like a generated CSS class name and may
    # change between site builds - verify against the live markup.
    return _first_text(
        root,
        '//p[@class="_1crvurj" and contains(., "' + label + '")]/../preceding-sibling::div/text()')


def page_scraping(url, timeout = 30):
    """Scrape one vehicle listing page into a ``pd.Series``.

    Parameters
    ----------
    url : str
        Address of the listing detail page.
    timeout : float
        Seconds passed to ``requests.get``.

    Returns
    -------
    pd.Series or None
        ``None`` when the page cannot be fetched (connection error, timeout or
        HTTP error status). Otherwise a Series with the keys ``year``,
        ``make``, ``model``, ``sub_model``, ``city``, ``state``, ``mileage``,
        ``price``, ``style``, ``exterior_color``, ``interior_color``,
        ``engine``, ``drive_type``, ``fuel_type``, ``transmission``,
        ``accidents``, ``title``, ``owners`` and ``use_type``. All values are
        the raw page strings; a field whose element is absent from the page is
        ``None`` instead of raising ``IndexError`` and aborting the whole run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort: a page that cannot be fetched is skipped by the caller.
        return None
    root = html_parser.fromstring(response.content)

    # extract vehicle year, make and model information
    # The heading reads "<year> <make> <model words...>".
    heading = _first_text(root, '//h1[contains(@class,"heading-base")]/div[@class="heading-2"]/text()')
    tokens = heading.split() if heading else []
    year = tokens[0] if tokens else None
    make = tokens[1] if len(tokens) > 1 else None
    model = ' '.join(tokens[2:]) if tokens else None

    sub_model = _first_text(root, '//h1[contains(@class,"heading-base")]/div[@class="heading-base"]/text()')

    # Location is rendered as "<city>, <state>".
    location = _first_text(root, '//div[@class="d-flex align-items-center padding-top-1"]/p/text()')
    location_parts = [part.strip() for part in location.split(',')] if location else []
    city = location_parts[0] if location_parts else None
    state = location_parts[1] if len(location_parts) > 1 else None

    mileage = _first_text(root, '//div[@class="d-flex flex-column margin-top-1"]/p/text()')

    price = _first_text(root, '//div[@data-test="vdpPreProspectPrice"]/text()')

    return pd.Series({'year':year, 'make':make, 'model':model, 'sub_model':sub_model, 'city':city, 'state':state, 'mileage':mileage,
                      'price':price,
                      'style':_spec_value(root, 'Style'),
                      'exterior_color':_spec_value(root, 'Exterior Color'),
                      'interior_color':_spec_value(root, 'Interior Color'),
                      'engine':_spec_value(root, 'Engine'),
                      'drive_type':_spec_value(root, 'Drive Type'),
                      'fuel_type':_spec_value(root, 'Fuel Type'),
                      'transmission':_spec_value(root, 'Transmission'),
                      'accidents':_history_value(root, 'Accident'),
                      'title':_history_value(root, 'Title'),
                      'owners':_history_value(root, 'Owner'),
                      'use_type':_history_value(root, 'Use Type')})

In [6]:
def scraping(urls):
    """Scrape every listing URL and assemble the results into one DataFrame.

    Parameters
    ----------
    urls : iterable of str
        Listing detail-page URLs, e.g. the output of ``urls_scraping``.

    Returns
    -------
    pd.DataFrame
        One row per page for which ``page_scraping`` returned a Series, in
        input order, indexed 0..k-1. Pages that returned ``None`` are skipped.
        An empty DataFrame is returned when nothing was scraped, where
        ``pd.concat`` would otherwise raise ``ValueError``.
    """
    # Drop failed pages explicitly rather than relying on pd.concat silently
    # discarding None entries.
    records = [record for record in map(page_scraping, urls) if record is not None]
    if not records:
        return pd.DataFrame()
    return pd.concat(records, axis=1).T

In [7]:
%%time
# Crawl the paginated listing index (pages 1..MAX_PAGE) and collect every
# vehicle detail URL; timed because it issues one HTTP request per index page.
urls=urls_scraping()
urls[:10]  # preview the first ten collected URLs

CPU times: user 15.6 s, sys: 779 ms, total: 16.3 s
Wall time: 6min 4s


In [9]:
# Number of listing URLs collected by urls_scraping().
print(len(urls))

9993


In [10]:
# Fetch and parse every collected listing page (one HTTP request per URL);
# pages that page_scraping could not fetch do not produce a row.
data = scraping(urls)

In [17]:
# (rows, columns) of the scraped table.
# NOTE(review): the recorded output shows 20 columns (including `cpo`), but
# page_scraping as defined in this notebook returns 19 fields, and the cell
# execution counts are out of order - the saved outputs were presumably
# produced by a different version of the code. Restart & Run All to confirm.
data.shape

(9936, 20)

In [18]:
# Inspect the last five scraped rows as a sanity check.
data.tail(5)


Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,style,exterior_color,interior_color,engine,drive_type,fuel_type,transmission,accidents,title,owners,use_type,cpo
9931,2019,Toyota,Tacoma,TRD Sport Double Cab 5' Bed V6 2WD Automatic,Odessa,TX,20455,"$38,988",Pickup Truck,Barcelona Red Metallic,Graphite W/ Gun Metal,3.5L V-6 Gas,RWD,Gas,Automatic,0,Clean,2,Personal Use,True
9932,2019,Honda,CR-V,EX AWD,Brockton,MA,39071,"$29,296",SUV,Modern Steel Metallic,Gray,1.5L Inline-4 Gas Turbocharged,AWD,Gas,Automatic,0,Clean,1,Personal Use,True
9933,2016,Dodge,Journey,SE FWD,Frederick,MD,116443,"$9,995",SUV,White,Black,2.4L Inline-4 Gas,FWD,Gas,Automatic,0,Clean,1,Fleet Use,True
9934,2020,Toyota,Corolla,L CVT,Seffner,FL,28689,"$17,494",Sedan,Black Sand Pearl,Light Gray,1.8L Inline-4 Gas,FWD,Gas,Automatic,0,Clean,1,Personal Use,True
9935,2019,Toyota,Camry,L Automatic,Columbia,TN,62483,"$24,992",Sedan,Midnight Black Metallic,Ash,2.5L Inline-4 Gas,FWD,Gas,Automatic,0,Clean,1,Personal Use,True


In [19]:
# Persist the scraped table as UTF-8 CSV in the working directory. The
# DataFrame index is written as an unnamed first column (pandas default).
data.to_csv('listing.csv', encoding='utf-8')