In [1]:
import json
import requests
import re
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import unicodedata
from tqdm import tqdm

In [2]:
text = "ChuyÃªn cÄn há» Masteri Tháº£o Äiá»n giÃ¡ ráº» nháº¥t thá» trÆ°á»ng, há» trá»£ vay 80%. LH0936 721 ***Mr. HoÃ i"
# Helper that reduces accented (e.g. Vietnamese) text to plain ASCII.
def fix_text(text):
    """Return ``text`` with diacritics stripped and non-ASCII characters removed.

    The string is decomposed with Unicode NFD so that each accented letter
    becomes a base letter followed by combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks and any other non-ASCII character.

    Args:
        text: The input string.

    Returns:
        An ASCII-only string.
    """
    # 'đ' / 'Đ' (d with stroke) have no canonical decomposition, so NFD leaves
    # them intact and the ASCII step would silently delete the whole letter.
    # Map them to their base letters first.
    text = text.replace('đ', 'd').replace('Đ', 'D')
    # Split accented letters into base letter + combining marks.
    text = unicodedata.normalize('NFD', text)
    # Drop everything that is not ASCII (the combining marks included).
    return text.encode('ascii', 'ignore').decode('ascii')
# Apply fix_text to the sample string; as the last expression its result is the cell output.
fix_text(text)

'ChuyAn cAn ha Masteri Thao Aian giA ra nhat tha trang, ha tra vay 80%. LH0936 721 ***Mr. HoA i'

In [None]:
def _decode_html(response):
    """Parse a response body as UTF-8 HTML.

    The raw bytes are decoded directly instead of going through
    ``response.text`` so the result does not depend on ``response.encoding``,
    which can be ``None`` or a wrong guess when the server sends no charset.
    """
    return BeautifulSoup(response.content.decode('utf-8', errors='ignore'), 'html.parser')


def _node_text(node):
    """Return the whitespace-stripped text of a parsed node, or None if the node is missing."""
    return node.get_text(strip=True) if node is not None else None


def _digits_only(value):
    """Keep only digits and commas (the decimal separator used in the spec values)."""
    return re.sub(r'[^\d,]', '', value)


def _classify_legal(value):
    """Map the free-text legal status of a listing to one of the known labels, or None."""
    # Most specific label first: a long-term pink book also contains "Sổ hồng".
    if "lâu dài" in value:
        return "Sổ hồng lâu dài"
    if "Hợp đồng" in value:
        return "Hợp đồng mua bán"
    if "Sổ đỏ" in value or "Sổ hồng" in value:
        return "Sổ đỏ/ Sổ hồng"
    return None


def _classify_furniture(value):
    """Map the free-text furniture description to one of the known labels, or None."""
    for label in ("Cao cấp", "Đầy đủ", "Cơ bản"):
        if label in value:
            return label
    return None


# (substring looked for in the spec title, output key, converter applied to the value text).
# Order is the order in which the titles are tested; the first match wins.
_SPEC_FIELDS = (
    ("Mức giá", 'price', str),
    ("Diện tích", 'area', _digits_only),
    ("Số phòng ngủ", 'bedrooms', _digits_only),
    ("Số phòng tắm, vệ sinh", 'bathrooms', _digits_only),
    ("Hướng nh", 'direction', str),
    ("Hướng ban công", 'bal_direction', str),
    ("Pháp lý", 'legal', _classify_legal),
    ("Nội thất", 'furniture', _classify_furniture),
)


def _parse_listing(sample, sub_url):
    """Extract one listing record (a dict) from a parsed detail page.

    Fields whose element is not present on the page are left as None.
    """
    img_source = sample.find('div', class_='re__media-thumbs js__media-thumbs')
    images = []
    if img_source is not None:
        # Read the attribute through the parser so HTML entities in URLs are unescaped.
        images = [tag['data-src'] for tag in img_source.find_all(attrs={'data-src': True})]

    data = {
        'title': _node_text(sample.find('h1', class_='re__pr-title pr-title js__pr-title')),
        'description': _node_text(sample.find('div', class_='re__section-body re__detail-content js__section-body js__pr-description js__tracking')),
        'price': None,
        'area': None,
        'bedrooms': None,
        'bathrooms': None,
        'direction': None,
        'bal_direction': None,
        'legal': None,
        'furniture': None,
        'location': _node_text(sample.find('span', class_='re__pr-short-description js__pr-address')),
        'link': sub_url,
        'image': images
    }

    for item in sample.find_all('div', class_='re__pr-specs-content-item'):
        label = _node_text(item.find('span', class_='re__pr-specs-content-item-title'))
        value = _node_text(item.find('span', class_='re__pr-specs-content-item-value'))
        if label is None or value is None:
            # An incomplete spec row only costs that one field, not the whole listing.
            continue
        for marker, key, convert in _SPEC_FIELDS:
            if marker in label:
                data[key] = convert(value)
                break
    return data


def _get_or_none(session, url, **kwargs):
    """GET ``url``; on a requests-level failure log it and return None instead of raising."""
    try:
        return session.get(url, **kwargs)
    except requests.RequestException as exc:
        tqdm.write(f"Request to {url} failed: {exc}")
        return None


def fetch_and_save_data(start_page, end_page, output_file, timeout=30):
    """Scrape apartment listings from batdongsan.com.vn and save them as JSON.

    For every result page in ``start_page..end_page`` (inclusive) the listing
    links are collected, each detail page is downloaded and parsed, and the
    resulting records are written to ``output_file`` as a JSON array.

    Args:
        start_page: First result page number to fetch.
        end_page: Last result page number to fetch (inclusive).
        output_file: Path of the JSON file to write (UTF-8, overwritten).
        timeout: Per-request timeout in seconds passed to ``requests``.

    Pages or listings that cannot be fetched (network error or non-200
    status) are reported and skipped. Whatever was collected is written to
    ``output_file`` even if the run is interrupted by an exception.
    """
    from urllib.parse import urljoin

    base_url = "https://batdongsan.com.vn"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Referer': 'https://www.google.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Accept-Language': 'vi-VN,vi;q=0.9'
    }
    proxies = None  # Set to None to disable proxy
    request_kwargs = {'headers': headers, 'proxies': proxies, 'timeout': timeout}
    data_list = []

    try:
        # The context manager closes the session's pooled connections when done.
        with requests.Session() as session:
            # Adding a retry mechanism to handle transient connection errors
            retry = Retry(connect=3, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            for page in tqdm(range(start_page, end_page + 1), desc="Fetching pages"):
                url = f"https://batdongsan.com.vn/ban-can-ho-chung-cu/p{page}"
                response = _get_or_none(session, url, **request_kwargs)
                if response is None:
                    tqdm.write(f"Failed to retrieve page {page}")
                    continue
                # tqdm.write keeps log lines from breaking the progress bar.
                tqdm.write(f"Fetching page {page}: Status code {response.status_code}")
                if response.status_code != 200:
                    tqdm.write(f"Failed to retrieve page {page}")
                    continue

                soup = _decode_html(response)
                for link in soup.find_all('a', class_='js__product-link-for-product-id'):
                    href = link.get('href')
                    if not href:
                        continue
                    # urljoin handles both site-relative and absolute hrefs.
                    sub_url = urljoin(base_url + '/', href)
                    sample_res = _get_or_none(session, sub_url, **request_kwargs)
                    if sample_res is None:
                        continue
                    if sample_res.status_code != 200:
                        # Parsing an error page would only yield an all-None record.
                        tqdm.write(f"Skipping {sub_url}: Status code {sample_res.status_code}")
                        continue
                    try:
                        data_list.append(_parse_listing(_decode_html(sample_res), sub_url))
                    except AttributeError:
                        print("An element is missing some fields, skipping...")
    finally:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data_list, file, ensure_ascii=False, indent=4)

# Scrape result pages 12 through 16 (inclusive) and write the records to data.json.
fetch_and_save_data(12, 16, 'data.json')

Fetching pages:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching page 12: Status code 200


Fetching pages:  60%|██████    | 3/5 [00:03<00:01,  1.08it/s]

Fetching page 13: Status code 403
Failed to retrieve page 13
Fetching page 14: Status code 403
Failed to retrieve page 14


Fetching pages:  80%|████████  | 4/5 [00:04<00:00,  1.57it/s]

Fetching page 15: Status code 403
Failed to retrieve page 15
Fetching page 16: Status code 200


Fetching pages: 100%|██████████| 5/5 [00:11<00:00,  2.24s/it]


In [44]:
# Sample HTML fragment: one spec-value span as it appears on a listing page
text = '<span class="re__pr-specs-content-item-value">24,5 tỷ</span>'

# Remove every tag with a non-greedy pattern so only the inner text remains
content = re.compile(r'<.*?>').sub('', text)

print(content)

24,5 tỷ


In [12]:
# Browser-like request headers sent with the listing-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://www.google.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}
data_list = []

url = "https://batdongsan.com.vn/ban-can-ho-chung-cu"
session = requests.Session()

# Adding a retry mechanism to handle transient errors.
# raise_on_status=False makes the adapter hand back the last response once the
# retries are used up; with the default (True) a persistent 403 would raise a
# RetryError and the explicit 403 check below could never run.
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[403, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
# Remove the proxy settings or replace with valid proxy details
proxies = None  # Set to None to disable proxy

# An explicit timeout keeps the cell from hanging indefinitely on a stalled connection.
response = session.get(url, headers=headers, proxies=proxies, timeout=30)
if response.status_code == 403:
    print("Access forbidden. Check headers or try using a different proxy.")
else:
    # Surface any other 4xx/5xx status as an exception.
    response.raise_for_status()

In [15]:
# Parse the page only when the request succeeded. Binding soup to None otherwise
# keeps a failed request from leaving `soup` undefined (NameError on the next
# line) or silently showing a page parsed during an earlier run of this cell.
soup = BeautifulSoup(response.text, 'html.parser') if response.status_code == 200 else None
soup


<!DOCTYPE html>

<html lang="vi">
<head>
<link href="https://staticfile.batdongsan.com.vn" rel="preconnect"/>
<link href="https://staticfile.batdongsan.com.vn" rel="dns-prefetch"/>
<link href="https://file4.batdongsan.com.vn" rel="preconnect"/>
<link href="https://file4.batdongsan.com.vn" rel="dns-prefetch"/>
<script src="https://staticfile.batdongsan.com.vn/js/Common/Global/filestatic.ver11.msvbds.SyncFunctions.min.js" type="text/javascript"></script>
<script>
            window.unlFeatures = [{"f":"cplus-9046_new-pa-entry-point","v":"cplus-9046_new-pa-entry-point_version-c","s":"1"},{"f":"cplus-8015_implement-ui-message","v":"cplus-8015_implement-ui-message_version-a","s":"1"},{"f":"cplus-8842_new-label-verified-listing","v":"disabled","s":"0"},{"f":"cplus-9084_unified-search-srp-map-and-list","v":"cplus-9084_unified-search-srp-map-and-list_enabled","s":"1"},{"f":"BDSMON2-PIVOT-CONSUMER","v":"","s":"1"},{"f":"cplus-8431-braze-web-push","v":"","s":"1"},{"f":"cplus-8350_new-categories