<a href="https://colab.research.google.com/github/chongna95/Mudah-Condo-KL-Rental-Analyst/blob/main/beautifulsoup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [120]:
!pip install requests-html
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install tqdm



In [121]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
from tqdm import tqdm

# --- Scrape the Mudah.my Kuala Lumpur apartment/condominium rental listings ---
# Result: `data`, a list with one dict per listing card holding the keys
# 'Area', 'Property Name', 'Rental (MYR)' and 'Size (Squared Feet)'.

LISTINGS_URL = "https://www.mudah.my/kuala-lumpur/apartment-condominium-for-rent"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

# Class strings this scraper matches against the page markup.
LISTING_CLASS = "w_100% p_12px_16px d_flex flex-d_column jc_space-between ai_stretch"
SIZE_NUMBER_CLASS = 'fs_sm lh_1.25rem font-style_normal fw_bold c_var(--mudah-colors-text-hi-emp)'
SIZE_UNIT_CLASS = 'c_var(--mudah-colors-text-hi-emp) fs_sm lh_1.25rem font-style_normal fw_normal'


def _digits_to_int(text):
    """Return the digits found in `text` as an int, or None when there are none."""
    digits = re.sub(r'[^\d]', '', text)
    return int(digits) if digits else None


def _split_title(title):
    """Split a listing title into (property_name, area).

    Parenthesised text is removed first. The property name is the text before
    the first comma, cut at the first '@'. The area is the text between the
    first and second comma, or None when the title contains no comma (the
    original indexed [1] unconditionally and raised IndexError in that case).
    Both values are stripped so that stray spaces do not create separate
    groups for the same property later on.
    """
    cleaned = re.sub(r"\s*\(.*?\)", "", title)
    parts = cleaned.split(',')
    name = parts[0].split('@')[0].strip()
    area = parts[1].strip() if len(parts) > 1 else None
    return name, (area or None)


def _extract_size(listing):
    """Return the size as an int when the unit span mentions 'sq.ft', else None."""
    size_number = listing.find('span', class_=SIZE_NUMBER_CLASS)
    size_unit = listing.find('span', class_=SIZE_UNIT_CLASS)
    if not (size_number and size_unit):
        return None
    if 'sq.ft' not in size_unit.get_text(strip=True):
        return None
    return _digits_to_int(size_number.get_text(strip=True))


def _extract_price(listing):
    """Return the digits of the 'currPrice' span as an int, or None if absent."""
    price_rm = listing.find('span', class_="currPrice")
    if not price_rm:
        return None
    return _digits_to_int(price_rm.get_text(strip=True))


data = []
page = 1

# The total page count is unknown up front, so tqdm is updated manually.
# Using it as a context manager closes the bar even if the loop raises.
with tqdm(desc="Scraping pages", unit="page") as pbar:
    while True:
        url = f"{LISTINGS_URL}?o={page}"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of losing it to a crash.
            tqdm.write(f"Stopping at page {page}: request failed ({exc})")
            break
        if not response.ok:
            tqdm.write(f"Stopping at page {page}: HTTP {response.status_code}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.find_all('div', class_=LISTING_CLASS)

        # Stop if no more listings
        if not listings:
            break

        for listing in listings:
            title_tag = listing.find('h3', class_='c_black')
            if title_tag is None:
                # A card without a title cannot be attributed to a property.
                continue
            property_name, area = _split_title(title_tag.get_text(strip=True))

            data.append({'Area': area,
                         'Property Name': property_name,
                         'Rental (MYR)': _extract_price(listing),
                         'Size (Squared Feet)': _extract_size(listing)})

        page += 1
        pbar.update(1)
# --- Build the listings frame and the per-property summary -------------------
# Declaring the columns explicitly keeps the frame well-formed even when the
# scrape returned no rows (an empty list would otherwise yield no columns and
# the dropna below would raise KeyError).
df = (
    pd.DataFrame(
        data,
        columns=["Area", "Property Name", "Rental (MYR)", "Size (Squared Feet)"],
    )
    # Drop rows without rent or size FIRST. Filling missing values before this
    # step would turn the dropna into a no-op and put the string "None" into
    # the numeric columns, which breaks the mean aggregation below.
    .dropna(subset=["Rental (MYR)", "Size (Squared Feet)"])
    # Missing values force a float dtype; restore integers now that they are gone.
    .astype({"Rental (MYR)": "int64", "Size (Squared Feet)": "int64"})
    # Label missing text fields with 'None' so groupby does not silently drop them.
    .fillna({"Area": "None", "Property Name": "None"})
)

# Group & summarize: mean size and mean rent per (area, property).
summary = (
    df.groupby(["Area", "Property Name"])
    .agg({"Size (Squared Feet)": "mean", "Rental (MYR)": "mean"})
    .reset_index()
    .rename(columns={"Size (Squared Feet)": "Average Size (Squared Feet)",
                     "Rental (MYR)": "Average Rental (MYR)"})
)
summary["Average Size (Squared Feet)"] = summary["Average Size (Squared Feet)"].round(0).astype(int)
summary["Average Rental (MYR)"] = summary["Average Rental (MYR)"].round(0).astype(int)

# Most expensive first; ties broken by larger average size.
sorted_summary = summary.sort_values(
    by=["Average Rental (MYR)", "Average Size (Squared Feet)"],
    ascending=[False, False],
)

sorted_summary




Scraping pages: 14page [01:09,  4.99s/page]
Scraping pages: 43page [02:13,  3.11s/page]


Unnamed: 0,Area,Property Name,Average Size (Squared Feet),Average Rental (MYR)
348,Mont Kiara,10 Mont Kiara,3720,14000
231,KL City,One KL,3285,14000
359,Mont Kiara,Serene Mont Kiara,2918,14000
358,Mont Kiara,Pavilion Hilltop,2767,13000
39,Bangsar,The Loft,3800,12888
...,...,...,...,...
274,Kepong,Desa Satu,651,750
193,Desa Petaling,Desa Petaling Flat,600,750
195,Desa Petaling,Desa Sri Puteri B Apartments,650,700
165,Cheras,Taman Bukit Anggerik,603,600


In [124]:
# Persist the per-property summary; the row index is omitted from the file.
sorted_summary.to_csv(
    'mudah_rental_data.csv',
    index=False,
)

In [126]:
# Reorder the individual listings from highest to lowest rent and show them.
df = df.sort_values(
    by="Rental (MYR)",
    ascending=False,
)
df

Unnamed: 0,Area,Property Name,Rental (MYR),Size (Squared Feet)
786,Desa ParkCity,Park Place,18000,2390
44,KL City,One KL,14000,3285
1376,KL City,One KL,14000,3285
1461,KL City,One KL,14000,3285
733,KL City,One KL,14000,3285
...,...,...,...,...
1241,Ampang,Pandan Court,600,880
1379,Cheras,Taman Bukit Anggerik,600,603
537,Cheras,Cheras Ria,600,570
1453,Kuala Lumpur,Setapak,590,590


In [127]:
# Persist the individual listings; the row index is omitted from the file.
df.to_csv(
    'rental.csv',
    index=False,
)