## Import necessary libraries.

In [2]:
import csv
import os
import pprint

import bs4
import pandas as pd
import requests

### Create helper functions.
`scrape_data(url)` scrapes the page at the URL passed as a parameter and returns a list of dictionaries, one per sold property.

In [3]:
def _parse_property_details(details_text):
    """Split the text of a "Property Details" cell into individual fields.

    The text is treated as a semicolon-separated list. Fragments of the form
    ``name: value`` are stored as ``{name: value}`` (a ``Style`` name is stored
    under ``Stories``). Fragments whose second word is ``Bedroom(s)`` or
    ``Bathroom(s)`` store their first word under ``Bedrooms`` / ``Bathrooms``.
    Any other fragment, including an empty one, is ignored.

    Args:
        details_text: Raw text of the details cell.

    Returns:
        dict mapping field name to its (whitespace-stripped) string value.
    """
    parsed = {}
    for item in details_text.split(";"):
        item = item.strip()
        if not item:
            # Trailing or doubled semicolons yield empty fragments.
            continue
        if ":" in item:
            # Split on the first colon only, so a value that itself contains
            # ":" cannot break the two-way unpacking.
            sub_key, sub_value = (part.strip() for part in item.split(":", 1))
            parsed["Stories" if sub_key == "Style" else sub_key] = sub_value
        else:
            words = item.split()
            if len(words) < 2:
                # Not a "<count> <label>" fragment; nothing to record.
                continue
            if words[1] in ("Bathroom", "Bathrooms"):
                parsed["Bathrooms"] = words[0]
            elif words[1] in ("Bedroom", "Bedrooms"):
                parsed["Bedrooms"] = words[0]
    return parsed


def scrape_data(url):
    """Scrape sold-property records from one listing page.

    Every two-cell ``<tr>`` on the page is read as a ``key``/``value`` pair
    (colons are removed from the key). Pairs accumulate into one record until
    a ``First Time Buyer`` row is seen, which closes the record. A
    ``Property Details`` row is additionally expanded into separate fields.

    Args:
        url: Address of the page to download.

    Returns:
        list of dicts, one per property, in page order. An empty list is
        returned (and a message printed) when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # A timeout keeps an unresponsive server from hanging the notebook.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []

    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # NOTE(review): only the first <span class="sold"> on the page is read, so
    # every record from this page gets the same "Sale Date" - confirm against
    # the page markup that this is intended.
    sold_span = soup.find("span", class_="sold")
    sale_date = sold_span.get_text(strip=True) if sold_span else None

    data = []
    current_property = {}
    for row in soup.find_all("tr"):
        tds = row.find_all("td")
        # Only two-cell rows carry a key/value pair.
        if len(tds) != 2:
            continue

        key = tds[0].get_text(strip=True).replace(":", "")
        value = tds[1].get_text(strip=True)

        if key == "Property Details":
            current_property.update(_parse_property_details(value))
        # The raw cell text is always kept under its own key as well.
        current_property[key] = value

        # "First Time Buyer" is the last row of a property: close the record.
        if key == "First Time Buyer":
            if sale_date:
                current_property["Sale Date"] = sale_date
            data.append(current_property)
            current_property = {}
    return data

### Loop through all webpages in the 2021 dataset.

In [4]:
base_url = "http://mlg.ucd.ie/modules/python/assign1/property/2021-page{:02d}.html"
data_2021 = []

# The 2021 listing is split across pages 01-17; collect each page's records in order.
for page_num in range(1, 18):
    url = base_url.format(page_num)
    data_2021 += scrape_data(url)



### Convert to dataframe

In [6]:
# Column order of the exported table. Scraped keys not listed here are dropped;
# listed keys missing from a record are left empty.
columns = ["Sale Date", "Sale Price", "Location", "Year Built", "Garden", "Garage","Type", "Stories", "Bedrooms", "Bathrooms" , "First Time Buyer"]
df = pd.DataFrame(data_2021, columns=columns)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh Restart & Run All working).
os.makedirs("data", exist_ok=True)
df.to_csv("data/2021_property_sales_data.csv")

### Let's now scrape the 2022 property sales

In [7]:
base_url = "http://mlg.ucd.ie/modules/python/assign1/property/2022-page{:02d}.html"
data_2022 = []

# The 2022 listing is split across pages 01-16; collect each page's records in order.
for page_num in range(1, 17):
    url = base_url.format(page_num)
    data_2022 += scrape_data(url)

### Convert to dataframe

In [8]:
# Column order of the exported table. Scraped keys not listed here are dropped;
# listed keys missing from a record are left empty.
columns = ["Sale Date", "Sale Price", "Location", "Year Built", "Garden", "Garage","Type", "Stories", "Bedrooms", "Bathrooms" , "First Time Buyer"]
df = pd.DataFrame(data_2022, columns=columns)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh Restart & Run All working).
os.makedirs("data", exist_ok=True)
df.to_csv("data/2022_property_sales_data.csv")

### Let's now scrape the 2023 data

In [9]:
base_url = "http://mlg.ucd.ie/modules/python/assign1/property/2023-page{:02d}.html"
data_2023 = []

# The 2023 listing is split across pages 01-18; collect each page's records in order.
for page_num in range(1, 19):
    url = base_url.format(page_num)
    data_2023 += scrape_data(url)

### Convert to dataframe

In [10]:
# Column order of the exported table. Scraped keys not listed here are dropped;
# listed keys missing from a record are left empty.
columns = ["Sale Date", "Sale Price", "Location", "Year Built", "Garden", "Garage","Type", "Stories", "Bedrooms", "Bathrooms" , "First Time Buyer"]
df = pd.DataFrame(data_2023, columns=columns)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh Restart & Run All working).
os.makedirs("data", exist_ok=True)
df.to_csv("data/2023_property_sales_data.csv")

### Let's now scrape the 2024 data.

In [11]:
base_url = "http://mlg.ucd.ie/modules/python/assign1/property/2024-page{:02d}.html"
data_2024 = []

# The 2024 listing is split across pages 01-23; collect each page's records in order.
for page_num in range(1, 24):
    url = base_url.format(page_num)
    data_2024 += scrape_data(url)

### Convert to dataframe

In [12]:
# Column order of the exported table. Scraped keys not listed here are dropped;
# listed keys missing from a record are left empty.
columns = ["Sale Date", "Sale Price", "Location", "Year Built", "Garden", "Garage","Type", "Stories", "Bedrooms", "Bathrooms" , "First Time Buyer"]
df = pd.DataFrame(data_2024, columns=columns)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh Restart & Run All working).
os.makedirs("data", exist_ok=True)
df.to_csv("data/2024_property_sales_data.csv")