In [98]:
import pandas as pd
import bs4
import os
import requests


In [99]:
def parse_description(desc):
    """Split a listing's free-text description into its four components.

    The description is treated as a ';'-separated list of parts. A part that
    starts with 'Type:' or 'Style:' supplies that field; a part that ends with
    'Bedroom(s)' or 'Bathroom(s)' supplies the corresponding count, taken from
    the first whitespace-delimited token of the part.

    Parameters
    ----------
    desc : str or None
        Raw description text, e.g.
        'Type: Detached; Style: 1-Storey; 3 Bedrooms; 1 Bathroom'.

    Returns
    -------
    tuple
        (type_, style, bedrooms, bathrooms). Every element is a string, or
        None when the matching part is absent. Counts are returned as text,
        not converted to int.
    """
    type_ = None
    style = None
    bedrooms = None
    bathrooms = None

    # Nothing to parse for a missing/empty description.
    if not desc:
        return type_, style, bedrooms, bathrooms

    # Split on ';' alone and strip each part, so the result does not depend on
    # the source using exactly one space after every separator.
    for raw_part in desc.split(';'):
        part = raw_part.strip()
        if not part:
            continue
        if part.startswith('Type:'):
            type_ = part[len('Type:'):].strip()
        elif part.startswith('Style:'):
            style = part[len('Style:'):].strip()
        elif part.endswith(('Bedroom', 'Bedrooms')):
            bedrooms = part.split()[0]
        elif part.endswith(('Bathroom', 'Bathrooms')):
            bathrooms = part.split()[0]

    return type_, style, bedrooms, bathrooms


In [100]:
# Number of listing pages published for each sale year; drives the page URLs below.
years = {
    2021: 15,
    2022: 17,
    2023: 17,
    2024: 23
}

# Seconds to wait on the server; without a timeout a stalled connection
# would hang the cell indefinitely.
REQUEST_TIMEOUT = 30


def _parse_listing(li, year):
    """Convert one <li> listing element into a flat record dict.

    Returns None when the listing has no <table class="sale">. The record
    always carries 'Year' and 'Sale Date'; every two-cell table row adds one
    key named after its label (colon removed), except the 'Description' row,
    which is expanded into 'Type', 'Style', 'Bedrooms' and 'Bathrooms' via
    parse_description.
    """
    table = li.find('table', class_='sale')
    if table is None:
        return None

    # A listing without a "sold" span gets a missing date instead of
    # aborting the whole scrape with an AttributeError.
    sold_span = li.find('span', class_='sold')
    record = {
        'Year': year,
        'Sale Date': sold_span.text.strip() if sold_span is not None else None,
    }

    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) != 2:
            # Only label/value rows carry data.
            continue
        label = cols[0].text.strip().replace(':', '').strip()
        value = cols[1].text.strip()
        if label == 'Description':
            type_, style, bedrooms, bathrooms = parse_description(value)
            record['Type'] = type_
            record['Style'] = style
            record['Bedrooms'] = bedrooms
            record['Bathrooms'] = bathrooms
        else:
            record[label] = value
    return record


all_data = []

# One session for all pages so the HTTP connection can be reused.
with requests.Session() as session:
    for year, total_pages in years.items():
        for page in range(1, total_pages + 1):
            url = f"http://mlg.ucd.ie/modules/python/assignment1/property/{year}-page{page:02d}.html"
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            # Every URL is expected to exist (page counts are listed above), so
            # an HTTP error means the dataset would be silently incomplete.
            response.raise_for_status()

            soup = bs4.BeautifulSoup(response.content, 'html.parser')
            content_div = soup.find('div', id='content')
            ol = content_div.find('ol') if content_div is not None else None
            if ol is None:
                continue

            for li in ol.find_all('li'):
                record = _parse_listing(li, year)
                if record is not None:
                    all_data.append(record)

# Build the frame once from the collected records.
df = pd.DataFrame(all_data)

df.head()

Unnamed: 0,Year,Sale Date,Sale Price,Property Location,Year Built,Garden,Garage,Type,Style,Bedrooms,Bathrooms,First Time Buyer
0,2021,Sold 2021-01-10,"€381,302.00",Broomhouse,1967,Yes,Yes,Detached,1.5-Storey,3,1,No
1,2021,Sold 2021-01-10,"€325,898.00",Broomhouse,1978,Yes,???,Detached,1-Storey,3,1,Yes
2,2021,Sold 18 January 2021,"€ 370,354",Oak Park,1961,Yes,No,Detached,1-Storey,3,2,No
3,2021,Sold 2021-01-23,"€92,480.00",Beacon Hill,1958,Yes,No,Bungalow,1-Storey,1,1,Yes
4,2021,Sold 2021-01-25,"€312,030.00",Brookville,1987,Yes,Yes,Detached,1-Storey,3,1,No


In [101]:
# Scraped values look like "Sold 2021-01-10" or "Sold 18 January 2021".
# Remove the "Sold" prefix by pattern rather than by position, so a value
# without the prefix is not truncated, then parse the mixed date formats.
# The dtype guard makes the cell safe to re-run: once the column is already
# datetime, the .str accessor would raise.
if not pd.api.types.is_datetime64_any_dtype(df['Sale Date']):
    df['Sale Date'] = pd.to_datetime(
        df['Sale Date'].str.replace(r'^\s*Sold\s*', '', regex=True),
        format="mixed",
    )

In [102]:
# Display the first rows to inspect the 'Sale Date' column.
df.head()

Unnamed: 0,Year,Sale Date,Sale Price,Property Location,Year Built,Garden,Garage,Type,Style,Bedrooms,Bathrooms,First Time Buyer
0,2021,2021-01-10,"€381,302.00",Broomhouse,1967,Yes,Yes,Detached,1.5-Storey,3,1,No
1,2021,2021-01-10,"€325,898.00",Broomhouse,1978,Yes,???,Detached,1-Storey,3,1,Yes
2,2021,2021-01-18,"€ 370,354",Oak Park,1961,Yes,No,Detached,1-Storey,3,2,No
3,2021,2021-01-23,"€92,480.00",Beacon Hill,1958,Yes,No,Bungalow,1-Storey,1,1,Yes
4,2021,2021-01-25,"€312,030.00",Brookville,1987,Yes,Yes,Detached,1-Storey,3,1,No


In [103]:
# Scraped prices look like "€381,302.00" or "€ 370,354".
# Remove the euro sign, thousands separators and stray whitespace by pattern.
# Dropping the first character by position is not repeatable: re-running the
# cell would cut the leading digit off every price. The values stay as
# strings here; the guard skips the step if the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['Sale Price']):
    df['Sale Price'] = df['Sale Price'].str.replace(r'[€,\s]', '', regex=True)

In [104]:
# Display the first rows to inspect the 'Sale Price' column.
df.head()

Unnamed: 0,Year,Sale Date,Sale Price,Property Location,Year Built,Garden,Garage,Type,Style,Bedrooms,Bathrooms,First Time Buyer
0,2021,2021-01-10,381302.0,Broomhouse,1967,Yes,Yes,Detached,1.5-Storey,3,1,No
1,2021,2021-01-10,325898.0,Broomhouse,1978,Yes,???,Detached,1-Storey,3,1,Yes
2,2021,2021-01-18,370354.0,Oak Park,1961,Yes,No,Detached,1-Storey,3,2,No
3,2021,2021-01-23,92480.0,Beacon Hill,1958,Yes,No,Bungalow,1-Storey,1,1,Yes
4,2021,2021-01-25,312030.0,Brookville,1987,Yes,Yes,Detached,1-Storey,3,1,No
