In [45]:
import datetime
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Load agent names

In [67]:
# Reading a Google Sheet as CSV with pandas:
# https://medium.com/@Bwhiz/step-by-step-guide-importing-google-sheets-data-into-pandas-ae2df899257f
sheet_name = 'Agent' # replace with your own sheet name
sheet_id = '1QXAKFfXR8oBMXL_mVJx3SkqEpN0qIRG5davdcwCq1-0' # replace with your sheet's ID
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

agent_list = pd.read_csv(url)


def _lowered_column(frame, column):
    """Return the non-null values of `column` in `frame`, lower-cased, as a list."""
    return frame[column].dropna().str.lower().tolist()


# Each sheet column is an independent keyword list, so nulls are dropped per column.
agent_name_list = _lowered_column(agent_list, 'agent name')
prefix_name = _lowered_column(agent_list, 'prefix_name')
owner_pharses = _lowered_column(agent_list, 'owner_pharses')
agent_pharses = _lowered_column(agent_list, 'agent_pharses')
livinginsider_profiles = _lowered_column(agent_list, 'livinginsider_profiles')

agent_list.head()

Unnamed: 0,prefix_name,agent name,owner_pharses,agent_pharses,livinginsider_profiles
0,property,Nut Ananganjanagit,เจ้าของขายเอง,co-agent,agent post
1,asset,eakapong_r,owner post,co agent,co agent
2,condo,Pluto_Landmarks,ownerpost,co-agent welcome,รหัสทรัพย์
3,estate,MaxTierRealty,welcome agent,co agent welcome,บริการฝากขาย
4,prop,suwit,agent welcome,welcome co agent,ที่ปรึกษาอสังหา


### Define functions

In [68]:
def is_agent(fullname: str, prefixes: list) -> bool:
    """Tell whether a poster name contains any of the agent keywords.

    Spaces are removed and case is ignored on both sides before comparing.
    Despite the parameter name, a keyword matches anywhere in the name, not
    only at its start.

    Args:
        fullname: Display name of the poster.
        prefixes: Keywords that mark a name as belonging to an agent.

    Returns:
        True when at least one non-blank keyword occurs in the name.
    """
    if not fullname:
        return False
    normalized_fullname = fullname.replace(" ", "").lower()
    for prefix in prefixes:
        normalized_prefix = prefix.replace(" ", "").lower()
        # A blank keyword would be a substring of every name, so skip it.
        if normalized_prefix and normalized_prefix in normalized_fullname:
            return True
    return False

In [69]:
def extract_numbers(text: str) -> str:
    """Drop every character except digits, hyphens and dots, keeping their order."""
    unwanted = re.compile(r"[^-\d.]+")
    return unwanted.sub("", text)

In [70]:
def clean_text(text):
    """Return `text` with every newline and space character removed."""
    removed = {ord("\n"): None, ord(" "): None}
    return text.translate(removed)

In [71]:
def check_content(content, owner_keywords=None, agent_keywords=None):
    """Classify a post description as written by an agent (True) or not (False).

    The description and every keyword are normalized the same way (hyphens
    become spaces, text is lower-cased) so that a keyword such as "co-agent"
    can match.

    Args:
        content: Description text of the post.
        owner_keywords: Phrases that mark an owner's post. Defaults to the
            notebook-level `owner_pharses` list.
        agent_keywords: Phrases that mark an agent's post. Defaults to the
            notebook-level `agent_pharses` list.

    Returns:
        True for an agent's post, False for an owner's post or when nothing
        matches.
    """
    if owner_keywords is None:
        owner_keywords = owner_pharses
    if agent_keywords is None:
        agent_keywords = agent_pharses

    def _normalize(text):
        return text.replace("-", " ").lower()

    def _mentions_any(text, keywords):
        # Blank keywords are skipped: an empty string is a substring of everything.
        return any(
            keyword.strip() and _normalize(keyword) in text
            for keyword in keywords
        )

    if not content:
        return False
    content = _normalize(content)
    # Checked first because owner phrases may be contained in this agent phrase.
    if 'co agent welcome' in content:
        return True # Agent's post
    if _mentions_any(content, owner_keywords):
        return False # Owner's post
    if _mentions_any(content, agent_keywords):
        return True # Agent's post
    return False # Might be owner's post

In [72]:
def check_profile(content, owner_keywords=None, profile_keywords=None):
    """Classify a profile description as belonging to an agent (True) or not (False).

    The profile text and every keyword are normalized the same way (hyphens
    become spaces, text is lower-cased) so that hyphenated keywords can match.

    Args:
        content: Text taken from the poster's profile page.
        owner_keywords: Phrases that mark an owner. Defaults to the
            notebook-level `owner_pharses` list.
        profile_keywords: Phrases that mark an agent profile. Defaults to the
            notebook-level `livinginsider_profiles` list.

    Returns:
        True for an agent profile, False for an owner or when nothing matches.
    """
    if owner_keywords is None:
        owner_keywords = owner_pharses
    if profile_keywords is None:
        profile_keywords = livinginsider_profiles

    def _normalize(text):
        return text.replace("-", " ").lower()

    def _mentions_any(text, keywords):
        # Blank keywords are skipped: an empty string is a substring of everything.
        return any(
            keyword.strip() and _normalize(keyword) in text
            for keyword in keywords
        )

    if not content:
        return False
    content = _normalize(content)
    # Checked first because owner phrases may be contained in this agent phrase.
    if 'co agent welcome' in content:
        return True # Agent's post
    if _mentions_any(content, owner_keywords):
        return False # Owner's post
    if _mentions_any(content, profile_keywords):
        return True # Agent's post
    return False # Might be owner's post

## Create a dictionary

In [117]:
# Column-oriented store for collected posts: every key maps to its own list,
# and the lists grow in lockstep (one entry per collected post).
room_dict = {
    column: []
    for column in (
        "date",
        "condo_name",
        "building",
        "floor",
        "area",
        "bedroom",
        "bathroom",
        "price",
        "phone",
        "owner",
        "room_number",
        "line_id",
        "url",
    )
}

In [118]:
# Search-result pages to start crawling from. The percent-encoded part of the
# path decodes to the Thai search word "บางเขน".
page_urls = [
    "https://www.livinginsider.com/searchword/Condo/Rent/25/%E0%B8%9A%E0%B8%B2%E0%B8%87%E0%B9%80%E0%B8%82%E0%B8%99.html",
]

## Scraping function

### Condo

In [None]:
def _fetch_soup(url, timeout):
    """Download `url` and parse the response body with the html.parser backend."""
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or an empty string when it is missing."""
    return tag.text if tag is not None else ""


def _profile_marks_agent(post_soup, timeout):
    """Fetch the poster's profile page and run `check_profile` on its description.

    Raises whatever the lookup raises (missing element, unmatched pattern,
    request failure); the caller treats any failure as "not an agent".
    """
    onclick_value = post_soup.find('div', class_="box-star-detail").get('onclick')
    profile_number = re.search(r"gotoReviewpage\('(\d+)'\)", onclick_value).group(1)
    profile_url = f"https://www.livinginsider.com/profile/{profile_number}/member_info.html"
    profile_soup = _fetch_soup(profile_url, timeout)
    profile_content = profile_soup.find('div', class_="text-body-info-profile").text
    return bool(profile_content and check_profile(profile_content))


def _extract_listing(detail_content):
    """Return (condo_name, price_text, properties) read from a post's detail block.

    `properties` holds the cleaned text of every property cell, in page order.
    Raises AttributeError when an expected element is missing.
    """
    condo_name = detail_content.find('span', class_="text_project_detail_green").text
    price_element = detail_content.find('div', class_="t-24 price-detail mt-0 price_topic")
    price = price_element.find('b').text
    detail_property = detail_content.find('div', class_="row detail-row")
    room_details = detail_property.find_all('div', class_="col-xs-6 col-sm-6 col-md-6 col-lg-3 detail-col-property-list")
    properties = [
        clean_text(detail.find('span', class_='detail-property-list-text').text)
        for detail in room_details
    ]
    return condo_name, price, properties


def search_room(url, max_pages=50, timeout=30):
    """Crawl search-result pages and append non-agent posts to `room_dict`.

    Starting at `url`, every post on the page is opened. Posts judged to come
    from an agent (by name list, name keywords, description, or profile page)
    are skipped; the rest are appended to the notebook-level `room_dict`.
    The crawl follows the " Next ›" pagination link until it disappears or
    `max_pages` pages have been processed.

    Args:
        url: Search-result page to start from.
        max_pages: Maximum number of result pages to process.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. Results are accumulated in `room_dict`.

    Raises:
        requests.RequestException: When a search-result page cannot be fetched.
            Failures on individual posts are printed and skipped.
    """
    page_number = 1
    agent_counts = 1
    while True:
        soup = _fetch_soup(url, timeout)
        posts = soup.find_all('div', class_="col-md-3 col-sm-4 col-xs-6 padding_topic margin_top_topic")
        print(f">>>>>>>>>>>>>>> Start Scrapping page {page_number}")
        for post in posts:
            anchor = post.find('a', class_="image-ratio-4-3")
            link = anchor.get('href') if anchor is not None else None
            if not link:
                print("Post has no detail link, skip this iteration !!!!!")
                continue
            try:
                in_post_soup = _fetch_soup(link, timeout)
            except requests.RequestException as e:
                print(f"Request error while opening {link}: {e}")
                continue
            detail_content = in_post_soup.find('div', class_="detail-content col-sm-12 main-detail-content")
            if detail_content is None:
                print("In post content is None, skip this iteration !!!!!")
                continue
            owner_tag = detail_content.find('label')
            if owner_tag is None:
                print("Owner name is missing, skip this iteration !!!!!")
                continue
            owner = owner_tag.text
            title = _tag_text(detail_content.find('h1', class_="font_sarabun show-title"))
            content = _tag_text(detail_content.find('p', class_='wordwrap'))

            # Owner phrases are stored lower-cased, so compare against a lower-cased title.
            title_lower = title.lower()
            if not any(keyword in title_lower for keyword in owner_pharses):
                if owner.lower() in agent_name_list or is_agent(owner, prefix_name) or check_content(content):
                    print(f"Found {agent_counts} Agents : {owner} continue")
                    agent_counts += 1
                    continue
                try:
                    agent_by_profile = _profile_marks_agent(in_post_soup, timeout)
                except Exception as e:
                    # Deliberately best-effort: a failed profile lookup must not drop the post.
                    print(f"Extracting profile error: {e}")
                    agent_by_profile = False
                if agent_by_profile:
                    print(f"Found {agent_counts} Agents by checking profile: {owner} continue")
                    agent_counts += 1
                    continue

            try:
                condo_name, price, properties = _extract_listing(detail_content)
            except AttributeError as e:
                print(f"Error occured while extracting one of these information condo_name, price, room_detail: {e}")
                # Skip the post; recording it would reuse values from a previous post.
                continue
            if len(properties) < 4:
                print(f"Expected 4 property values but found {len(properties)}, skip this iteration !!!!!")
                continue

            # All values are validated above, so the columns always grow together.
            room_dict['date'].append(datetime.datetime.now().strftime("%d/%m/%Y"))
            room_dict['condo_name'].append(condo_name)
            room_dict['building'].append(None)
            room_dict['floor'].append(properties[1])
            room_dict['area'].append(properties[0])
            room_dict['bedroom'].append(properties[2])
            room_dict['bathroom'].append(properties[3])
            room_dict['price'].append(extract_numbers(price).replace('.', ""))
            room_dict['phone'].append(None)
            room_dict['owner'].append(owner)
            room_dict['room_number'].append(None)
            room_dict['line_id'].append(None)
            room_dict['url'].append(link)

            print(f"Collected {len(room_dict['url'])} posts")

        pagination = soup.find('ul', class_="pagination")
        pages = pagination.find_all('a', class_="loading_page") if pagination is not None else []
        if not pages:
            print(f"Break!!! End page: {page_number}")
            break
        next_page = pages[-1]
        print(next_page.text)
        next_url = next_page.get('href')
        if next_page.text != " Next ›" or page_number >= max_pages or not next_url:
            print(f"Break!!! End page: {page_number}")
            break
        url = next_url
        print(f"Finish page :{page_number} ========================")
        page_number += 1

In [None]:
# Crawl every configured search-result URL; each call appends rows to room_dict.
for page_url in page_urls:
    search_room(page_url)

## Filter by counting (< 4 is owner, >= 4 is agent)

In [108]:
OWNER_PATH = '../csv_files/livinginsider/owner_room.csv'
EXPECTED_AGENT_PATH = '../csv_files/livinginsider/expected_agent.csv'
EXPECTED_AGENT_LIST_PATH = '../csv_files/livinginsider/expected_agent_list.csv'

# Posters with at least this many collected posts are treated as expected agents.
AGENT_POST_THRESHOLD = 4

all_room_df = pd.DataFrame(room_dict)

# Number of collected posts per poster, aligned row by row with all_room_df.
posts_per_owner = all_room_df.groupby('owner')['owner'].transform("count")

# Rows from posters below the threshold are kept as owners.
filltered_df = all_room_df[posts_per_owner < AGENT_POST_THRESHOLD]

# Rows from posters at or above the threshold are flagged as expected agents.
expected_agent_df = all_room_df[posts_per_owner >= AGENT_POST_THRESHOLD]
expected_agent_list = expected_agent_df['owner'].value_counts()

# Create the output folders first; to_csv does not create missing directories.
for csv_path in (OWNER_PATH, EXPECTED_AGENT_PATH, EXPECTED_AGENT_LIST_PATH):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

# Export to CSV
filltered_df.to_csv(OWNER_PATH)
expected_agent_df.to_csv(EXPECTED_AGENT_PATH)
expected_agent_list.to_csv(EXPECTED_AGENT_LIST_PATH)

In [15]:
# Collected post count per owner among the rows kept as owners (fewer than 4 posts each).
filltered_df.groupby("owner").size()

owner
ARMTRAIPOP              1
ART_JP                  1
Aileen KT               1
Amm_Anuchaivong         1
Amorntaep_Komutanon     1
                       ..
รัชดาวรรณ เพ็ญสุข...    1
รัตนชาติ...             1
สุดชา ผาผง...           1
เกรดสุรางค์...          2
โอปอชา_ลาลาลันล่า...    2
Length: 112, dtype: int64

In [16]:
# Collected post count per owner among the rows flagged as expected agents (4 or more posts each).
expected_agent_df.groupby("owner").size()

owner
Natchaaa           13
Oilsata Wealthy     4
Tanya_Noi          10
surasak191XXX       4
dtype: int64

In [17]:
# Display the rows flagged as expected agents.
expected_agent_df

Unnamed: 0,date,condo_name,building,floor,area,bedroom,bathroom,price,phone,owner,room_number,line_id,url
1,10/04/2025,เคฟ ทาวน์ ไอส์แลนด์,,7,22ตร.ม.,1,1,12000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/238...
2,10/04/2025,เทอร์ร่า เรสซิเดนซ์ เฟส 1 - 2,,21-50,30ตร.ม.,1,1,14000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/255...
4,10/04/2025,เคฟ ทาวน์ ไอส์แลนด์,,8,23ตร.ม.,1,1,13000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/246...
5,10/04/2025,ดีคอนโด ไฮป์ รังสิต,,1-4,24.50ตร.ม.,1,1,11000,,surasak191XXX,,,https://www.livinginsider.com/livingdetail/257...
6,10/04/2025,แอทโมซ คาแนล รังสิต,,3,28ตร.ม.,1,1,13000,,surasak191XXX,,,https://www.livinginsider.com/livingdetail/256...
7,10/04/2025,ดีคอนโด แคมปัส โดม - รังสิต,,1,30ตร.ม.,ห้องสตูดิโอ,1,9000,,surasak191XXX,,,https://www.livinginsider.com/livingdetail/256...
8,10/04/2025,นิว ครอส คูคต สเตชัน,,5,26.70ตร.ม.,1,1,10000,,surasak191XXX,,,https://www.livinginsider.com/livingdetail/257...
21,10/04/2025,เคฟ ทาวน์ ไอส์แลนด์,,4,38ตร.ม.,2,1,23000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/239...
35,10/04/2025,ดีคอนโด ไฮป์ รังสิต,,5-10,26ตร.ม.,1,1,12000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/253...
36,10/04/2025,เทอร์ร่า เรสซิเดนซ์ เฟส 1 - 2,,30,33ตร.ม.,1,1,12000,,Natchaaa,,,https://www.livinginsider.com/livingdetail/239...
