In [18]:
import urllib.request
url = "http://mlg.ucd.ie/modules/python/assign1/cars/BMW-page01.html"

import urllib.request
import urllib.error

try:
    # trying to open the URL I gave
    response = urllib.request.urlopen(url)
    # read the response data and decode it into a string
    html = response.read().decode("utf-8")
    # display the HTML string
    print(html)
    #now try to catch any errors
except urllib.error.HTTPError as e:
    print(f"HTTP Error {e.code}: {e.reason}")
    html = None
except urllib.error.URLError as e:
    print(f"Network Error: {e.reason}")
    html = None
except Exception as e:
    print(f"Unexpected error: {e}")
    html = None

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="robots" content="noindex">  
  <meta name="description" content="Audi car sales records - Browse used Audi vehicles with detailed specifications, prices, and sale information for educational purposes.">
  <title>Audi Car Sales Records - Page 1 of 20</title>
  <link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
  <script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
  <link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
    <div class="container">
        <main>
          <div class="row">
            <div class="col-md-12 page-top">
              <h1><a href="index.html">Car Database</a> &md

now that our html file was successfully loaded, we can parse using a BeatifulSoup project to maintain the structure of the html page 

In [82]:
import bs4 

parser = bs4.BeautifulSoup(html, "html.parser")

#list to store results 
cars = []

#now find all the car names with the li tag 
carMatches = parser.find_all("li", {"class": "car-item"})

#now check for matches
if not carMatches: 
    print("No car records found")
#the important part: find necessary info for each car to extract and put into a dictionary for serialization
else:
    print(f"Processing {len(carMatches)} car records...")
    for i, match in enumerate(carMatches, 1):
        # within each car match, find appropriate tags and extract the text from each one
        carname_elem = match.find("h3", {"class": "make-model"})
        carname = carname_elem.text.strip()

        '''
        Annoyingly, the classes all have the same name (unlike what we worked on in the labs.)
        So now we have to define a method to find the actual values in the middle of all the extra stuff 
        using some methods I found in the BeautifulSoup Library. It also makes sense to go ahead and strip 
        the text from the values in this method instead of doing it at length later. 
        '''

        def findValue(labelText): 
            def matchesLabelText(text): 
                if text is None: 
                    return False 
                return text.strip() == labelText
                
            label_tdClass = match.find("td", string = matchesLabelText)
            if label_tdClass: 
                value_tdClass = label_tdClass.find_next_sibling("td") 
                return value_tdClass.get_text(strip=True) 
            return None 


        #now apply this method we wrote to the values that we know correspond with each stat about the car:
        saleprice = findValue("Sale Price:")
        saledate = findValue("Date of Sale:")
        year = findValue("Year:")
        mileage = findValue("Mileage:")
        classification = findValue("Classification:")
        transmission = findValue("Transmission:")
        fuel = findValue("Fuel Type:")
        description = findValue("Description:")
        location = findValue("Sale Location:")
        
        # clean the saleprice string so that it can be read as a numeric value
        salepriceReplacements = {"â‚¬": "", ",": "", ".00": ""}
        for old, new in salepriceReplacements.items(): 
            saleprice = saleprice.replace(old, new)

        #clean the mileage string because some of them say "miles" after it, which throws off data collection
        mileageReplacements = {" miles": "", ",": ""}
        for old, new in mileageReplacements.items(): 
            mileage = mileage.replace(old, new)
        
        # validate that we have non-empty data
        if not all([saleprice, year, mileage, classification, transmission, fuel, description, location]):
            print(f"Warning: Car record {i} contains empty fields")
            continue
        
        # add all of the information to a dictionary for ease of downloading later
        car = {"carname": carname, 
               "saleprice": saleprice, 
               "year": year, 
               "mileage": mileage, 
               "classification": classification, 
               "transmission": transmission, 
               "fuel": fuel, 
               "description": description, 
               "location": location}
        cars.append(car)
            
    print(f"Successfully extracted {len(cars)} complete car records")

Processing 20 car records...
Successfully extracted 20 complete car records


now we print all of our squeaky clean data

In [83]:
for i, car in enumerate(cars, 1):
    print(f"{i}. {car} \n")

1. {'carname': 'BMW 1 Series (116d)', 'saleprice': '9445', 'year': '2017', 'mileage': '41159', 'classification': 'Hatchback', 'transmission': 'Automatic', 'fuel': 'Diesel', 'description': 'Red BMW 1 Series. 1.5 116d SE Auto Euro 6 (s/s) 5dr. 3 previous owners', 'location': 'Kerry'} 

2. {'carname': 'BMW 3 Series (SE Business Edition)', 'saleprice': '6495', 'year': '2010', 'mileage': '103469', 'classification': 'Estate', 'transmission': 'Automatic', 'fuel': 'Diesel', 'description': 'Grey BMW 3 Series. 2.0 320d SE Business Edition Touring 5dr. 2 previous owners', 'location': 'Galway'} 

3. {'carname': 'BMW X1 (SE)', 'saleprice': '15429', 'year': '2017', 'mileage': '43571', 'classification': 'SUV', 'transmission': 'Manual', 'fuel': 'Diesel', 'description': 'BMW X1. xDrive 18d SE 5dr Diesel Estate 2.0', 'location': 'Donegal'} 

4. {'carname': 'BMW 8 Series (840d)', 'saleprice': '58023', 'year': '2020', 'mileage': '10505', 'classification': 'Convertible', 'transmission': 'Automatic', 'fuel'

Now let's export this to a CSV file for ease of access in making an ABT.

In [84]:
with open("carsales.csv", "w") as fout: 
    fout.write(f"Car Name; Sale Price; Year; Mileage; Classification; Transmission; Fuel Type; Description; Location \n")
    for car in cars:
        fout.write(f"{car["carname"]}; ")
        fout.write(f"{car["saleprice"]}; ")
        fout.write(f"{car["year"]}; ")
        fout.write(f"{car["mileage"]}; ")
        fout.write(f"{car["classification"]}; ")
        fout.write(f"{car["transmission"]}; ")
        fout.write(f"{car["fuel"]}; ")
        fout.write(f"{car["description"]}; ")
        fout.write(f"{car["location"]} ")
        fout.write("\n")
with open("carsales.csv", "r") as fin: 
    print(fin.read())

Car Name; Sale Price; Year; Mileage; Classification; Transmission; Fuel Type; Description; Location 
BMW 1 Series (116d); 9445; 2017; 41159; Hatchback; Automatic; Diesel; Red BMW 1 Series. 1.5 116d SE Auto Euro 6 (s/s) 5dr. 3 previous owners; Kerry 
BMW 3 Series (SE Business Edition); 6495; 2010; 103469; Estate; Automatic; Diesel; Grey BMW 3 Series. 2.0 320d SE Business Edition Touring 5dr. 2 previous owners; Galway 
BMW X1 (SE); 15429; 2017; 43571; SUV; Manual; Diesel; BMW X1. xDrive 18d SE 5dr Diesel Estate 2.0; Donegal 
BMW 8 Series (840d); 58023; 2020; 10505; Convertible; Automatic; Diesel; Black BMW 8 Series. 840d xDrive Convertible 3.0 2dr. 2 previous owners; Dublin 
BMW 4 Series Gran Coupe (420i); 20962; 2018; 33570; Hatchback; Automatic; Petrol; Black BMW 4 Series Gran Coupe. 420i M Sport Gran Coupe 2.0 5dr. 1 previous owner; Dublin 
BMW M5 (V8);  25443; 2014; 66377; Saloon; Automatic; Petrol; Blue BMW M5. 4.4 V8 DCT Euro 5 (s/s) 4dr. 6 previous owners; Cork 
BMW X3 (20d); 3733