<a href="https://colab.research.google.com/github/chongna95/Mudah-Condo-KL-Rental-Analyst/blob/main/beautifulsoup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
!pip install requests-html
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install tqdm



In [95]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
from tqdm import tqdm

# --- Scraper configuration ---------------------------------------------------
BASE_URL = "https://www.mudah.my/kuala-lumpur/apartment-condominium-for-rent"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever

# CSS class strings used to locate each piece of a listing card in the page HTML.
LISTING_CLASS = "w_100% p_12px_16px d_flex flex-d_column jc_space-between ai_stretch"
TITLE_CLASS = 'c_black'
SIZE_NUMBER_CLASS = 'fs_sm lh_1.25rem font-style_normal fw_bold c_var(--mudah-colors-text-hi-emp)'
SIZE_UNIT_CLASS = 'c_var(--mudah-colors-text-hi-emp) fs_sm lh_1.25rem font-style_normal fw_normal'
PRICE_CLASS = "currPrice"


def _digits_to_int(text):
    """Return the digits found in `text` as an int, or None if there are none.

    Every non-digit character (currency prefix, thousands separators, spaces)
    is removed before converting. Returning None for digit-free text avoids the
    ValueError that int('') would raise.
    """
    digits = re.sub(r'[^\d]', '', text)
    return int(digits) if digits else None


def _split_title(title):
    """Split a listing title into ``(property_name, area)``.

    Parenthesised text is removed first. The property name is the text before
    the first comma, cut at '@' when present; the area is the text between the
    first and second comma. Both are stripped of surrounding whitespace so that
    otherwise identical values produce identical group keys later on.
    `area` is None when the title has no comma (or the part after it is empty).
    """
    cleaned = re.sub(r"\s*\(.*?\)", "", title)
    parts = cleaned.split(',')
    name = parts[0].split('@')[0].strip()
    # Check the number of comma-separated parts, not the length of the string:
    # indexing parts[1] on a comma-less title would raise IndexError.
    area = parts[1].strip() if len(parts) > 1 else None
    return name, (area or None)


def _parse_listing(listing):
    """Convert one listing card (a BeautifulSoup tag) into a record dict.

    Returns None when the card has no title element, so the caller can skip it.
    Size is only recorded when the unit text contains 'sq.ft'; size and price
    are None when their elements are missing or contain no digits.
    """
    title_tag = listing.find('h3', class_=TITLE_CLASS)
    if title_tag is None:
        return None
    name, area = _split_title(title_tag.get_text(strip=True))

    size = None
    size_number = listing.find('span', class_=SIZE_NUMBER_CLASS)
    size_unit = listing.find('span', class_=SIZE_UNIT_CLASS)
    if size_number is not None and size_unit is not None:
        if 'sq.ft' in size_unit.get_text(strip=True):
            size = _digits_to_int(size_number.get_text(strip=True))

    price_tag = listing.find('span', class_=PRICE_CLASS)
    price = _digits_to_int(price_tag.get_text(strip=True)) if price_tag is not None else None

    return {'Property Name': name,
            'Area': area,
            'Size (Squared Feet)': size,
            'Rental (MYR)': price}


# --- Scrape every result page until one comes back without listings ----------
data = []
page = 1

# tqdm with manual update (the total number of pages is not known up front);
# the context manager closes the bar even if the loop raises.
with tqdm(desc="Scraping pages", unit="page") as pbar:
    while True:
        url = f"{BASE_URL}?o={page}"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Stop paginating but keep everything collected so far.
            print(f"Stopping at page {page}: {exc}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.find_all('div', class_=LISTING_CLASS)

        # Stop if no more listings
        if not listings:
            break

        for listing in listings:
            record = _parse_listing(listing)
            if record is not None:
                data.append(record)

        page += 1
        pbar.update(1)
# Column groups of the listings table.
KEY_COLUMNS = ["Property Name", "Area"]
VALUE_COLUMNS = ["Size (Squared Feet)", "Rental (MYR)"]

# One row per scraped listing. Passing `columns` fixes the schema so the
# group-by below still finds its columns when `data` is empty.
df = pd.DataFrame(data, columns=KEY_COLUMNS + VALUE_COLUMNS)

# Replace missing *text* keys with 'None' so those rows are kept by groupby
# (rows with NaN keys are dropped by default). The numeric columns are left
# with NaN on purpose: writing the string 'None' into them would turn them into
# object columns and break the mean aggregation, whereas NaN is skipped by mean.
df[KEY_COLUMNS] = df[KEY_COLUMNS].fillna("None")
df[VALUE_COLUMNS] = df[VALUE_COLUMNS].astype("float64")

# Group & summarize: average size and rental per (property, area), rounded to
# whole numbers.
summary = (
    df.groupby(KEY_COLUMNS)
    .agg({"Size (Squared Feet)": "mean", "Rental (MYR)": "mean"})
    .round(0)
    .reset_index()
    .rename(columns={"Size (Squared Feet)": "Average Size (Squared Feet)",
                     "Rental (MYR)": "Average Rental (MYR)"})
)

# Most expensive first; ties broken by larger average size.
sorted_summary = summary.sort_values(
    by=["Average Rental (MYR)", "Average Size (Squared Feet)"],
    ascending=[False, False],
)

sorted_summary




Scraping pages: 53page [02:59,  3.38s/page]


Unnamed: 0,Property Name,Area,Average Size (Squared Feet),Average Rental (MYR)
2,10 Mont Kiara,Mont Kiara,3720.0,14000.0
316,One KL,KL City,3285.0,14000.0
476,Serene Mont Kiara,Mont Kiara,2918.0,14000.0
358,Pavilion Hilltop,Mont Kiara,2767.0,13000.0
577,The Loft,Bangsar,3800.0,12888.0
...,...,...,...,...
134,Desa Satu,Kepong,651.0,744.0
132,Desa Petaling Flat,Desa Petaling,600.0,733.0
137,Desa Sri Puteri B Apartments,Desa Petaling,650.0,700.0
544,Taman Bukit Anggerik,Cheras,603.0,600.0


In [96]:
# Write the sorted summary table to a CSV file, omitting the DataFrame index.
OUTPUT_CSV = 'rental_summary.csv'
sorted_summary.to_csv(OUTPUT_CSV, index=False)