# Import libraries

In [1]:
# from utils import *
import os
import requests
from bs4 import BeautifulSoup, Tag
import json
import time

from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

## Crawl general information

In [None]:
def get_names(url):
    return url.split(".vn/")[1].split("/")[0]

def crawl_intro(url, output_file):
    name = get_names(url)
    print(name)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"[x] Request failed for {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    class_names = ["intro-content","intro-stats", "heading-desc", "footer-container"]
    parts = []

    for class_name in class_names:
        elements = soup.find_all(class_=class_name)
        for el in elements:
            text = el.get_text(separator="\n", strip=True)
            if text:
                parts.append(text)

    if not parts:
        print(f"[!] No target classes found for {url}")
        return None

    raw_text = "\n".join(parts)

    result = [ {
        "url": url,
        "raw_text": raw_text
    }]
    filename = name + ".json"    
    output_file = os.path.join("data/general_info", filename)
    with open("ve-chung-toi.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

# os.makedirs("data/general_info", exist_ok=True)
# crawl_intro("https://vienthammydiva.vn/ve-chung-toi/")

In [None]:
url = 'https://vienthammydiva.vn/lien-he'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

raw_info = soup.find("ul", class_="contact-list")

info = raw_info.find_all("li")

content = "\n".join([piece.get_text() for piece in info])


json_format = [{"url": 'https://vienthammydiva.vn/lien-he',
                "content": content}]

with open("contact.json", "w", encoding="utf-8") as f:
    json.dump(json_format, f, ensure_ascii=False, indent=2)



## Crawl subpage that has multiple sublinks

### Find sublinks

In [1]:
def get_names(url):
    return url.split(".vn/")[1].split("/")[0]

def get_tab_links(url, class_name, page_num=1):
    links = []
    for i in range(1, page_num+1):
        new_url = url +f"page/{i}/"
        res = requests.get(new_url)
        if res.status_code != 200:
            print("Lỗi khi truy cập trang:", url)
            return []

        soup = BeautifulSoup(res.text, "html.parser")

        # Tìm div có class "tab-card-content active"
        tab_div = soup.find("div", class_=class_name)
        if not tab_div:
            print(f"Không tìm thấy {class_name}")
            return []

        # Tìm tất cả thẻ <a> bên trong vùng đó
        
        for a in tab_div.find_all("a", href=True):
            links.append(a["href"])

    return links

def crawl_post_title_and_content(post_url):
    """Lấy tiêu đề và nội dung bài viết"""
    print(f"    [-] Crawling post: {post_url}")
    res = requests.get(post_url)
    soup = BeautifulSoup(res.text, "html.parser")

    title_tag = soup.find("h1", class_="post-title")
    title = title_tag.get_text().strip()
    content_div = soup.find("div", class_="mc-content-post-single")
    if not content_div:
        print(f"[!] Không tìm thấy nội dung trong: {post_url}")
        return None

    tag_names = [tag.name for tag in content_div.find_all()]
    # Gộp các đoạn văn thành 1 string
    paragraphs = content_div.find_all(["p", "h2", "li", "strong"])
    content  = content_div.get_text(separator="\n", strip=True)
    print(type(content))

    parts = []
    return {
        "url": post_url,
        "title": title,
        "content": content
    }

def crawl_category(category_url, output_file, class_name, page_num):

    results = []
    post_links = get_tab_links(category_url, class_name, page_num)
    
    for url in post_links:
        try:
            data = crawl_post_title_and_content(url)
            if data:
                results.append(data)
        except Exception as e:
            print(f"    [x] Error crawling {url}: {e}")
        time.sleep(1)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"[✓] Done! Saved {len(results)} articles to {output_file}")





### Find the last page numbers

In [2]:
def get_last_page_number(category_url):
    # 1. Tải HTML của trang danh mục
    res = requests.get(category_url, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    anchors = soup.select("div.nav-links a.page-numbers")
    page_nums = []
    for a in anchors:
        txt = a.get_text(strip=True)
        if txt.isdigit():
            page_nums.append(int(txt))

    if not page_nums:
        return 1  
    return max(page_nums)



### Get beauty services information

In [None]:
service_sublinks = get_tab_links("https://vienthammydiva.vn/dich-vu-tham-my/", "tabs")
os.makedirs("diva_data/services", exist_ok=True)
for link in service_sublinks:
    print(link)
    name = get_names(link)
    filename = name + ".json"    
    output_file = os.path.join("diva_data/services", filename)
    page_num = get_last_page_number(link)
    print(page_num)
    crawl_category(link, output_file, "tab-card-content active", page_num)
    

https://vienthammydiva.vn/cham-soc-da/
1
    [-] Crawling post: https://vienthammydiva.vn/se-khit-lo-chan-long/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/phu-bong-nano-cho-da/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/hap-trang-nano/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/tam-trang-glubel/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/dien-di-tinh-chat/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/vi-kim-tao-bien/
<class 'str'>
[✓] Done! Saved 6 articles to diva_data/services\cham-soc-da.json
https://vienthammydiva.vn/dieu-tri-da/
1
    [-] Crawling post: https://vienthammydiva.vn/dieu-tri-seo-ro/
<class 'str'>
[✓] Done! Saved 1 articles to diva_data/services\dieu-tri-da.json
https://vienthammydiva.vn/phau-thuat-tham-my/
1
Không tìm thấy tab-card-content active
[✓] Done! Saved 0 articles to diva_data/services\phau-thuat-tham-my.json
https://vienthammydiva.vn/phun-xam-tham-my/
8
    [-] 

### Get beauty knowledge information

In [None]:
service_sublinks = get_tab_links("https://vienthammydiva.vn/kien-thuc-lam-dep/", "tabs")
os.makedirs("diva_data/beauty_knowledge", exist_ok=True)
for link in service_sublinks[0:-1]:
    print(link)
    name = get_names(link)
    filename = name + ".json"    
    output_file = os.path.join("diva_data/beauty_knowledge", filename)
    page_num = get_last_page_number(link)
    print(page_num)
    crawl_category(link, output_file, "tab-card-content active", page_num)

https://vienthammydiva.vn/cham-soc-toc/
1
    [-] Crawling post: https://vienthammydiva.vn/dan-ong-toc-xoan/
<class 'str'>
[✓] Done! Saved 1 articles to diva_data/beauty_knowledge\cham-soc-toc.json
https://vienthammydiva.vn/giam-can-kien-thuc-lam-dep/
2
    [-] Crawling post: https://vienthammydiva.vn/tep-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/he-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/tre-tron-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/mi-siukay-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/takoyaki-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/doi-lon-luoc-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/suong-sam-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/banh-cream-o-bao-nhieu-calo/
<class 'str'>
    [-] Crawling post: https://vienthammydi

### Get past news events information

In [None]:
os.makedirs("diva_data/news_event", exist_ok=True)
link = "https://vienthammydiva.vn/tin-tuc-su-kien/"
name = get_names(link)
filename = name + ".json"    
output_file = os.path.join("diva_data/news_event", filename)
page_num = get_last_page_number(link)
print(page_num)
crawl_category(link, output_file, "tab-card-content active", page_num)

35
    [-] Crawling post: https://vienthammydiva.vn/vien-tham-my-diva-uu-dai-mung-dai-le-tro-gia-toi-50-phi-dich-vu/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/vien-tham-my-diva-ky-ket-chuyen-giao-cong-nghe-adva-karisma-everqueen-buoc-tien-moi-trong-nganh-tham-my-tai-tao-da/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/tuong-ban-chan-kho/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/vien-tham-my-diva-khoi-dong-dai-hoi-ve-dep-vinh-cuu-giam-soc-toi-90-chi-phi/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/nhung-khoanh-khac-dang-nho-cua-phai-nu-trong-ngay-8-3-tai-vien-tham-my-diva/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/cung-diva-cham-tay-ve-dep-vinh-cuu/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/talkshow-ung-dung-cong-nghe-hieu-qua-trong-linh-vuc-lam-dep/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/vien-tham-my-diva-to-chuc-mua-lan-su-rong-khai-xuan-at-

### Get information from each facility

In [None]:
os.makedirs("diva_data/facility", exist_ok=True)
link = "https://vienthammydiva.vn/lien-he/"
name = get_names(link)
filename = name + ".json"    
output_file = os.path.join("diva_data/facility", filename)
page_num = get_last_page_number(link)
print(page_num)
crawl_category(link, output_file, "branch-grid", 8)

1
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-phuoc-buu/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-hon-dat/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-phu-ha-phan-rang/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-tan-phuoc-khanh/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-thong-nhat/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-tan-binh/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-binh-long/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-phu-giao/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi-co-so/vien-tham-my-diva-my-long/
<class 'str'>
    [-] Crawling post: https://vienthammydiva.vn/chuoi

In [None]:
os.makedirs("diva_data/human_resources", exist_ok=True)
url = "https://vienthammydiva.vn/doi-ngu-chuyen-gia/"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

def get_soup_info(soup):
    staff_list = []
    title = soup.find(class_="section-title").get_text()
    cards = soup.find_all("div", class_="slider-card")
    for card in cards:
        name_tag = card.find(class_="slider-card-title")
        position_tag = card.find(class_="slider-card-subtitle")
        exp_tag = card.find(class_="slider-card-desc")

        
        name =  name_tag.get_text(strip=True) if name_tag else ""
        position =  position_tag.get_text(strip=True) if position_tag else ""
        exp = exp_tag.get_text(strip=True) if exp_tag else ""
        

        staff_info = f"Tên: {name}, vị trí: {position}, kinh nghiệm: {exp}"

        staff_list.append(staff_info)

    content =  "\n".join(staff_list)
    result.append({
            "title": title,
            "content": content
    })


doctor_section = soup.find("section", class_="slider-section mc-experts", id="doctor-section")

manager_section = doctor_section.find_next("section", class_="slider-section mc-experts")
expert_section = soup.find("section", class_="expert-section")

# titles = soup.find_all(class_="section-title")

result = []
doctor_info = get_soup_info(doctor_section)
manager_info = get_soup_info(manager_section)
expert_info = get_soup_info(expert_section)

output_file = os.path.join("diva_data/human_resources", "human.json")
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)
