In [1]:
import numpy as np
from selenium import webdriver
from time import sleep
import random
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By
import pandas as pd
import os

# Folder path
folder_path = "D:/HOC KI 8/3. Graduate project/hasaki_crawling"

# Add user agen
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"

# Setting options
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={user_agent}")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--start-maximized")
options.add_argument("--disable-popup-blocking")
options.add_argument("--no-sandbox")

# Declare browser
driver = webdriver.Chrome(options=options)
sleep(random.randint(1,5))

In [3]:
# Read ids from crawled csv
productdata_filename = "productdata_20240318_2350.csv"

existing_df = pd.read_csv(os.path.join(folder_path, "data", productdata_filename))
existing_product_ids = existing_df['product_id'].tolist()
existing_data_product_ids = existing_df['data_product_id'].tolist()
existing_links = existing_df['link_item'].tolist()

In [4]:
# Get rating for comments:
def get_star(string):
    start_index = string.find(':')
    end_index = string.find('%')
    return int(string[start_index+1:end_index]) / 20

# Process data-product-id
def get_unique_data_productids(nested_list):
    unique_ids = set()
    for sublist in nested_list:
        unique_ids.update(sublist.split(','))
    return [id for id in unique_ids]

# Parse data_product_id
def parse_data_product_id(data_product_id_str):
    # Split string by ","
    id_list = data_product_id_str.split(',')
    # Get unique
    set_list = set()
    set_list.update(id_list)
    # Convert each element in the list to an integer and return
    return [id_ for id_ in set_list]

# ============================GET INFOMATION OF ALL ITEMS
crawled_ids = set()
df_list = []
for i, row in existing_df[:6].iterrows():
    
    # Get product page
    name_comment, content_comment, product_variant, datetime_comment, rating_comment = [], [], [], [], []
    driver.get(row['link_item'])
    
    # Get data_product_id_list
    elems_data_productids_list = driver.find_elements(By.CSS_SELECTOR, '.attribute-option-item')
    uniq_data_productids_list = parse_data_product_id(",".join([elem.get_attribute('data-product-ids') for elem in elems_data_productids_list]))
    uniq_data_product_id_str = ",".join(uniq_data_productids_list)

    # Get comment_pagination_number
    elems_cmtpage_nums = driver.find_elements(By.CSS_SELECTOR, '.pagination_comment a')
    commentpage_nums = [int(elem.get_attribute('rel')) for elem in elems_cmtpage_nums
                    if elem.get_attribute('rel').isdigit()]
    max_cmtpage = (max(commentpage_nums) if commentpage_nums else 1)
    # Decide whether to crawl
    if not set(uniq_data_productids_list).intersection(crawled_ids):
        # Get comment details
        for page_num in range(1, 2):
            try:
                print("Crawl Page " + str(page_num))
                elems_name = driver.find_elements(By.CSS_SELECTOR , ".title_comment strong.txt_color_1")
                name_comment = [elem.text for elem in elems_name] + name_comment
                print(name_comment)

                elems_content = driver.find_elements(By.CSS_SELECTOR , ".item_comment .content_comment")
                content_comment = [elem.text for elem in elems_content] + content_comment
                
                elems_product_variant = driver.find_elements(By.CSS_SELECTOR , ".item_comment .txt_999")
                product_variant = [elem.text for elem in elems_product_variant] + product_variant
                
                elems_datetime = driver.find_elements(By.CSS_SELECTOR , ".item_comment .timer_comment")
                datetime_comment = [elem.text for elem in elems_datetime] + datetime_comment

                elems_rating = driver.find_elements(By.CSS_SELECTOR , ".item_comment .number_start")
                rating_comment = [get_star(elem.get_attribute('style')) for elem in elems_rating] + rating_comment

                next_pagination_cmt = driver.find_element(By.CSS_SELECTOR, "a.item_next_sort .icon_carret_down")
                next_pagination_cmt.click()

                print("Clicked on button next page!")
                sleep(random.randint(5,7))

            except ElementNotInteractableException:
                print("Element Not Interactable Exception!")
                break

        # Add into a dataframe
        comment_data = pd.DataFrame(
            list(zip(name_comment, content_comment, product_variant, datetime_comment, rating_comment)), 
            columns = ['name_comment', 'content_comment','product_variant', 'datetime_comment', 'rating'])
        
        # Add column "link_item", "data_product_id_list", "data_product_id"
        comment_data.insert(0, "link_item", row['link_item'])
        comment_data.insert(1, "data_product_id_list", uniq_data_product_id_str)
        comment_data.insert(2, "data_product_id", row['data_product_id'])
        
        # For "data_product_id_list", convert string into list
        comment_data['data_product_id_list'] = comment_data['data_product_id_list'].apply(parse_data_product_id)
        df_list.append(comment_data)

        crawled_ids.update(uniq_data_productids_list)
        sleep(random.randint(7,9))
    else:
        continue

Crawl Page 1
['Thu Hương', 'Lưu văn Quân', 'lê thị như quỳnh', 'Vy Thao', 'Nguyễn Hoa', 'Mỹ Mỹ Trịnh', 'Trần Thị Thu Thuỷ', 'Vu thi Nga', 'Uyen Le', 'Uyen Le']
Clicked on button next page!
Crawl Page 1
['Ngọc Lý', 'Ngô thị ngư ý', 'Hoài anh', 'trinh nguyễn', 'Phan Hải', 'VAnh', 'Tú Vân', 'Kim An', 'Linh Linh', 'Xuân Trang']
Clicked on button next page!
Crawl Page 1
['TRINH NGUYỄN THỊ THÙY', 'nguyen hong ngoc thuy', 'NiNi', 'Nguyễn Tường An', 'Trần Thu Cần', 'Hồ Thị Hiền', 'Nguyễn thị thủy', 'Nguyễn Thị Thanh Thi', 'Mỹ Kim', 'Mani Ho']
Clicked on button next page!
Crawl Page 1
['Sô Thu', 'La Minh Dũng', 'KIM YẾN', 'Thi', 'minh ngọc', 'CHỊ THẢO', 'Hạ uyên', 'Yến vy', 'Nguyễn Ngọc Cát Tường', 'Uyen Le']
Clicked on button next page!
Crawl Page 1
['Phùng Tạ Linh', 'Ngoc Anh', 'Hoaithu25071993', 'Nguyễn Cát Tường', 'Hảo Phan', 'Vũ thu duyên', 'Nguyễn Thị Cẩm Dung', 'Lê Thuý Hoài', 'Minh Trang', 'Mai Đặng']
Clicked on button next page!


In [5]:
# Combine all comment crawled
combined_comment_data = pd.concat(df_list, ignore_index=True)
combined_comment_data.head(5)

Unnamed: 0,link_item,data_product_id_list,data_product_id,name_comment,content_comment,product_variant,datetime_comment,rating
0,https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi...,"[99707, 102557, 5294, 19286, 19325, 95711, 529...",19325,Thu Hương,"mình siêu ưng em này luôn, đợt đc tặng sn mà g...","Nước Tẩy Trang L'Oreal Dưỡng Ẩm Cho Da Thường,...",00: 15 | 08/03/2024,5.0
1,https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi...,"[99707, 102557, 5294, 19286, 19325, 95711, 529...",19325,Lưu văn Quân,tuyệt lắm nha,"Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",16: 05 | 06/03/2024,5.0
2,https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi...,"[99707, 102557, 5294, 19286, 19325, 95711, 529...",19325,lê thị như quỳnh,"một từ thôi ""tuyệt""ko thắc mắc gì hết","Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",21: 44 | 05/03/2024,5.0
3,https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi...,"[99707, 102557, 5294, 19286, 19325, 95711, 529...",19325,Vy Thao,"Đây là nước tẩy trang mà tui cảm giác ổn nhất,...","Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",11: 28 | 02/01/2024,5.0
4,https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi...,"[99707, 102557, 5294, 19286, 19325, 95711, 529...",19325,Nguyễn Hoa,Thương hiệu xứng đáng điểm 10,"Nước Tẩy Trang L'Oreal Tươi Mát Cho Da Dầu, Hỗ...",11: 04 | 05/12/2023,5.0


In [6]:
# Save into csv
from datetime import datetime
current_datetime = datetime.now().strftime("%Y%m%d_%H%M")
comment_data_file_name = f"comment_data_{current_datetime}.csv"
combined_comment_data.to_csv(os.path.join(folder_path, "data", comment_data_file_name), encoding='utf-8-sig')