In [8]:
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [9]:
# Page to scrape; the later cells read the address through `url`
url = "https://www.trurotoyota.com/en/new-inventory"


In [10]:
# Launch a Chrome session. The `finally` below always shuts it down, even when
# loading the page or waiting for the tiles raises (e.g. a timeout), so a
# failed run no longer leaves a browser process behind.
driver = webdriver.Chrome()
try:
    # Open the webpage
    driver.get(url)

    # Poll for up to 10 seconds for the dynamic content to appear
    wait = WebDriverWait(driver, 10)

    # Elements carrying both the `listing-new-tile-wrapper` and `false` classes.
    # This is a compound class, so it is written as an explicit CSS selector
    # instead of being passed to By.CLASS_NAME, which is meant for a single
    # class name.
    specific_divs = wait.until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.listing-new-tile-wrapper.false')
        )
    )

    # Read the text now: the elements are unusable once the session is closed
    div_texts = [div.text for div in specific_divs]
finally:
    # Close the WebDriver
    driver.quit()


In [11]:
# Display the raw text scraped from each tile (one string per tile)
div_texts


['2024 CAMRY SE UPGRADE\nFront Wheel DriveAutomatic2.5L 4cyl.\nMSRP\nPurchase Price (Cash)\n$35,868\nFreight, PDI & Other Fees Excluded\nSelling Price\n*$35,868\nShow Legal\nVIN 4T1S11AK1RU259982\nINSTALLED OPTIONS\nSupersonic Red\nStock #7761\n+ 22',
 '2024 RAV4 LIMITED\nAll Wheel DriveAutomatic2.5L 4cyl.\nMSRP\nPurchase Price (Cash)\n$46,803\nFreight, PDI & Other Fees Excluded\nSelling Price\n*$46,803\nShow Legal\nVIN 2T3D1RFV3RW443323\nINSTALLED OPTIONS\nLimited\nBlueprint\nStock #7641',
 '2024 VENZA HYBRID XLE\nAll Wheel DriveAutomatic2.5L 4cyl.\nMSRP\nPurchase Price (Cash)\n$50,843\nFreight, PDI & Other Fees Excluded\nSelling Price\n*$50,843\nShow Legal\nVIN JTEAAAAH2RJ170438\nINSTALLED OPTIONS\nTitanium Glow\nStock #7769\n+ 24',
 '2024 TACOMA TRD SPORT PLUS (6M)\nFour-Wheel DriveManual2.4L TURBO 4cyl.\nMSRP\nPurchase Price (Cash)\n$54,393\nFreight, PDI & Other Fees Excluded\nSelling Price\n*$54,393\nShow Legal\nVIN 3TYLE5JN8RT021098\nINSTALLED OPTIONS\nUnderground\nStock #7760\n+

In [12]:
# Split each tile's text on newlines, giving one list of lines per tile.
# Iterates the strings directly rather than indexing through range(len(...)).
text_data = [tile_text.split('\n') for tile_text in div_texts]


In [13]:
# Display the per-tile lists of lines built from div_texts
text_data 


[['2024 CAMRY SE UPGRADE',
  'Front Wheel DriveAutomatic2.5L 4cyl.',
  'MSRP',
  'Purchase Price (Cash)',
  '$35,868',
  'Freight, PDI & Other Fees Excluded',
  'Selling Price',
  '*$35,868',
  'Show Legal',
  'VIN 4T1S11AK1RU259982',
  'INSTALLED OPTIONS',
  'Supersonic Red',
  'Stock #7761',
  '+ 22'],
 ['2024 RAV4 LIMITED',
  'All Wheel DriveAutomatic2.5L 4cyl.',
  'MSRP',
  'Purchase Price (Cash)',
  '$46,803',
  'Freight, PDI & Other Fees Excluded',
  'Selling Price',
  '*$46,803',
  'Show Legal',
  'VIN 2T3D1RFV3RW443323',
  'INSTALLED OPTIONS',
  'Limited',
  'Blueprint',
  'Stock #7641'],
 ['2024 VENZA HYBRID XLE',
  'All Wheel DriveAutomatic2.5L 4cyl.',
  'MSRP',
  'Purchase Price (Cash)',
  '$50,843',
  'Freight, PDI & Other Fees Excluded',
  'Selling Price',
  '*$50,843',
  'Show Legal',
  'VIN JTEAAAAH2RJ170438',
  'INSTALLED OPTIONS',
  'Titanium Glow',
  'Stock #7769',
  '+ 24'],
 ['2024 TACOMA TRD SPORT PLUS (6M)',
  'Four-Wheel DriveManual2.4L TURBO 4cyl.',
  'MSRP',
  

In [14]:
# Reduce every tile (a list of text lines) to [model, drive, price, VIN].
extracted_data = []
for sublist in text_data:
    # The first two lines are taken as the model and the drivetrain summary.
    # A tile with fewer lines gets None instead of raising IndexError and
    # aborting the whole extraction.
    model = sublist[0] if len(sublist) > 0 else None
    drive = sublist[1] if len(sublist) > 1 else None

    price = None
    vin = None
    for item in sublist:
        if item.startswith('$'):
            # Only lines that begin with '$' count; a line such as '*$35,868'
            # is not matched. If several lines match, the last one wins.
            price = item
            continue

        tokens = item.split()
        # Require a standalone 'VIN' label followed by a value, so a line that
        # merely begins with those letters, or a bare 'VIN', is not mistaken
        # for the number.
        if len(tokens) > 1 and tokens[0] == 'VIN':
            vin = tokens[-1]

    extracted_data.append([model, drive, price, vin])

# Converting to DataFrame
df = pd.DataFrame(extracted_data, columns=['Model', 'Drive', 'Price', 'VIN'])

In [16]:
# Write the extracted table to temp.csv (relative to the working directory).
# NOTE(review): with pandas' default index=True the row index is also written
# as an unnamed first column — confirm that is wanted.
df.to_csv(path_or_buf='temp.csv')

In [None]:

# Fetch the page over plain HTTP. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP error
# page from being parsed as if it were the inventory.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find the <script> tags that contain the vehicle information (JSON-LD)
json_scripts = soup.find_all("script", type="application/ld+json")

# Empty list that will hold the extracted vehicle records
vehicles = []

# Set of vehicle names already seen, used to skip duplicates
seen_vehicles = set()


In [None]:

# Walk every JSON-LD <script> tag and collect the vehicles listed in ItemList
# objects into `vehicles`, skipping names already present in `seen_vehicles`.
for script in json_scripts:
    # Text content of the <script> tag; nothing to parse when it is empty
    json_text = script.string
    if not json_text:
        continue

    # Remove control characters before parsing
    json_text_cleaned = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', json_text)

    # Parse the JSON text; one malformed block is skipped instead of aborting
    # the whole loop.
    try:
        json_data = json.loads(json_text_cleaned)
    except json.JSONDecodeError:
        continue

    # The top level may be a single object or an array of objects; treat both
    # the same way and ignore anything that is not an object.
    candidates = json_data if isinstance(json_data, list) else [json_data]
    for candidate in candidates:
        # Only vehicle lists (ItemList) are of interest
        if not isinstance(candidate, dict) or candidate.get("@type") != "ItemList":
            continue

        # Iterate over every entry of the list
        for vehicle_item in candidate.get("itemListElement") or []:
            vehicle_info = vehicle_item.get("item") if isinstance(vehicle_item, dict) else None
            # Entries without an item object or without a name cannot be
            # de-duplicated or printed later, so they are skipped.
            if not isinstance(vehicle_info, dict) or "name" not in vehicle_info:
                continue

            vehicle_name = vehicle_info["name"]

            # A name not seen before is a new vehicle: store it and remember
            # the name.
            # NOTE(review): de-duplicating by name alone merges distinct units
            # that share a model name — confirm this is intended.
            if vehicle_name not in seen_vehicles:
                vehicles.append(vehicle_info)
                seen_vehicles.add(vehicle_name)

# Print the extracted vehicle information
for vehicle in vehicles:
    # `offers` may be absent or may be a list of offers; normalise it to a
    # single dict so a missing price prints as None instead of raising.
    offers = vehicle.get("offers") or {}
    if isinstance(offers, list):
        offers = offers[0] if offers else {}
    if not isinstance(offers, dict):
        offers = {}

    print("车辆名称:", vehicle["name"])
    print("车辆价格:", offers.get("price"), offers.get("priceCurrency"))
    print()  # blank line between vehicles for readability

# Number of distinct vehicles collected (shown as the cell output)
len(vehicles)

0