In [1]:
from bs4 import BeautifulSoup as bs
import os
import sqlite3

def _parse_specs_detail(detail_specs_html_path: str) -> dict:
    output = {}
    try:
        with open(detail_specs_html_path, 'r', encoding='utf-8') as f:
            detail_specs_html = f.read()
        
        soup = bs(detail_specs_html, 'html.parser')
        
        modal_content = soup.find('div', {'class': 'c-modal__content'})
        
        rows = modal_content.find_all('div', {'class': 'c-modal__row'})
        
        for row in rows:
            st_table_title = row.find('div', {'class': 'st-table-title'}).text.strip()
            # print(f"==>> st_table_title: {st_table_title}")
            
            table = row.find('table')
            if table is None:
                continue
            
            ####################### Parse in detail #######################
            row_of_table = table.find_all('tr')
            
            if st_table_title == 'Bộ xử lý':
                for rows_in_table in row_of_table:
                    cols = rows_in_table.find_all('td')
                    # print(f"==>> cols: {cols}")
                    
                    prop = cols[0].text.strip()
                    value = cols[1].text.strip()
                    
                    if prop == 'Công nghệ CPU':
                        output['CPU brand modifier'] = value
                    elif prop == 'Loại CPU':
                        output['CPU generation'] = value
                    elif prop == 'Tốc độ tối đa':
                        output['CPU Speed (GHz)'] = value
                    elif prop == 'Hãng CPU':
                        output['CPU manufacturer'] = value
            elif st_table_title == 'RAM':
                for rows_in_table in row_of_table:
                    cols = rows_in_table.find_all('td')
                    # print(f"==>> cols: {cols}")
                    
                    prop = cols[0].text.strip()
                    value = cols[1].text.strip()
                    
                    if prop == 'Dung lượng RAM':
                        output['RAM (GB)'] = value
                    elif prop == 'Loại RAM':
                        output['RAM Type'] = value
                    elif prop == 'Tốc độ RAM':
                        output['Bus (MHz)'] = value
            elif st_table_title == 'Màn hình':
                for rows_in_table in row_of_table:
                    cols = rows_in_table.find_all('td')
                    # print(f"==>> cols: {cols}")
                    
                    prop = cols[0].text.strip()
                    value = cols[1].text.strip()
                    
                    if prop == 'Kích thước màn hình':
                        output['Screen Size (inch)'] = value
                    elif prop == 'Độ phân giải':
                        output['Screen Resolution'] = value
                    elif prop == 'Tần số quét':
                        output['Refresh Rate (Hz)'] = value
            elif st_table_title == 'Đồ họa':
                for rows_in_table in row_of_table:
                    cols = rows_in_table.find_all('td')
                    # print(f"==>> cols: {cols}")
                    
                    prop = cols[0].text.strip()
                    value = cols[1].text.strip()
                    
                    if prop == 'Hãng':
                        output['GPU manufacturer'] = value
            elif st_table_title == 'Thông tin pin & Sạc':
                for rows_in_table in row_of_table:
                    cols = rows_in_table.find_all('td')
                    # print(f"==>> cols: {cols}")
                    
                    prop = cols[0].text.strip()
                    value = cols[1].text.strip()
                    
                    if prop == 'Dung lượng pin':
                        output['Battery'] = value
                        
        return output
    except Exception as e:
        print(f"==>> Error: {e}")
        return None

def _parse_html(conn: sqlite3.Connection, raw_html_path: str, detail_specs_html_path: str) -> dict:
    output = {}

    try:
        print('Parsing HTML', raw_html_path)
        
        if not os.path.exists(raw_html_path) or not os.path.exists(detail_specs_html_path):
            return {
                'status': 'error',
                'message': 'File not found',
                'data': None
            }
        
        with open(raw_html_path, 'r', encoding='utf-8') as f:
            raw_html = f.read()
            
        raw_soup = bs(raw_html, 'html.parser')
        
        # Get the manufacturer
        manufacturer = conn.execute(f'''
                        select Manufacturer from fpt_fetch_results
                        where Raw_html_path = '{raw_html_path}'
                                    ''').fetchone()[0]
        output['Manufacturer'] = manufacturer

        # Get the price
        price = raw_soup.find('div', {'class': 'st-price-main'}).text.strip()
        output['Price (VND)'] = price
        
        # Go to summary specs
        thong_so_ky_thuat = raw_soup.find('div', {'class': 'card re-card st-card'})
        card_body = thong_so_ky_thuat.find('div', {'class': 'card-body'})
        tbody = card_body.find('tbody')
        
        for row in tbody.find_all('tr'):
            cols = row.find_all('td')
            
            prop = cols[0].text.strip()
            value = cols[1].text.strip()
            
            if prop == 'Trọng lượng':
                output['Weight (kg)'] = value
            if prop == 'Ổ cứng':
                output['Storage (GB)'] = value
        
        # Go to detail specs
        detail_specs = _parse_specs_detail(detail_specs_html_path)
        
        if detail_specs is not None:
            output.update(detail_specs)
            
        return output
    except Exception as e:
        return None

In [2]:
# Smoke-test the parser on a single saved product page.
sample_conn = sqlite3.connect('database/introds.db')
try:
    sample_output = _parse_html(
        conn=sample_conn,
        raw_html_path='data/fpt/raw_htmls/msi_36.html',
        detail_specs_html_path='data/fpt/detail_htmls/msi_36.html'
    )
finally:
    # The connection used to be created inline in the call and was never
    # closed; release it once the sample has been parsed.
    sample_conn.close()

# Bare last expression so the notebook renders the parsed record.
sample_output

Parsing HTML data/fpt/raw_htmls/msi_36.html


{'Manufacturer': 'msi',
 'Price (VND)': '15.790.000₫',
 'Storage (GB)': 'SSD 512 GB',
 'Weight (kg)': '1.4 kg',
 'CPU manufacturer': 'Intel',
 'CPU brand modifier': 'Core i5',
 'CPU generation': '1335U',
 'CPU Speed (GHz)': '4.6 GHz',
 'RAM (GB)': '8 GB',
 'RAM Type': 'DDR4',
 'Bus (MHz)': '3200 MHz',
 'Screen Size (inch)': '14.0 inch',
 'Screen Resolution': '1920 x 1080 Pixels',
 'Refresh Rate (Hz)': '60 Hz',
 'GPU manufacturer': 'Intel'}

In [3]:
# Parse every fetched page listed in the database.
conn = sqlite3.connect('database/introds.db')

rows = conn.execute('select Raw_html_path, Detail_specs_html_path from fpt_fetch_results').fetchall()

res = []  # parsed product records
err = []  # raw HTML paths whose parsing failed

for row in rows:
    raw_html_path, detail_specs_html_path = row

    output = _parse_html(conn, raw_html_path, detail_specs_html_path)

    # _parse_html reports failure in two ways: None for parsing errors and a
    # {'status': 'error', ...} dict for missing files. Checking only for None
    # let the "File not found" dict slip into `res` as if it were a record.
    if output is None or output.get('status') == 'error':
        err.append(raw_html_path)
        continue

    print(f"==>> output: {output}")

    res.append(output)

print(f"==>> parsed: {len(res)}, failed: {len(err)}")

Parsing HTML data/fpt/raw_htmls/asus_3.html
==>> output: {'Manufacturer': 'asus', 'Price (VND)': '8.990.000₫', 'Storage (GB)': 'SSD 256 GB', 'Weight (kg)': '1.8 kg', 'CPU manufacturer': 'AMD', 'CPU brand modifier': 'Ryzen 3', 'CPU generation': '7320U', 'CPU Speed (GHz)': '4.1 GHz', 'RAM (GB)': '8 GB', 'RAM Type': 'LPDDR5', 'Bus (MHz)': '5500 MHz', 'Screen Size (inch)': '15.6 inch', 'Screen Resolution': '1920 x 1080 Pixels', 'Refresh Rate (Hz)': '60 Hz', 'GPU manufacturer': 'Intel', 'Battery': '3 Cell'}
Parsing HTML data/fpt/raw_htmls/asus_0.html
==>> output: {'Manufacturer': 'asus', 'Price (VND)': '17.990.000₫', 'Storage (GB)': 'SSD 512 GB', 'Weight (kg)': '2.3 kg', 'CPU manufacturer': 'Intel', 'CPU brand modifier': 'Core i5', 'CPU generation': '11400H', 'CPU Speed (GHz)': '4.5 GHz', 'RAM (GB)': '16 GB (2 thanh 8 GB)', 'RAM Type': 'DDR4', 'Bus (MHz)': '3200 MHz', 'Screen Size (inch)': '15.6 inch', 'Screen Resolution': '1920 x 1080 Pixels', 'Refresh Rate (Hz)': '144 Hz', 'GPU manufactur

In [4]:
import json

# Persist the parsed records as pretty-printed JSON; ensure_ascii=False keeps
# the non-ASCII characters (e.g. the currency sign) readable in the file.
with open('data/fpt/parse_results.json', 'w', encoding='utf-8') as results_file:
    serialized = json.dumps(res, ensure_ascii=False, indent=4)
    results_file.write(serialized)

In [5]:
# Number of records accumulated in `res` by the parsing loop.
len(res)

207

In [3]:
import re 

# Scratch check: pull every "<number> GB" token out of a combined RAM string.
abc = '16 GB (2 x 8 GB) DDR4 3200 MHz'

# NOTE(review): the original cell carried a "-> 1920 x 1080" reminder here,
# presumably for a similar extraction on the resolution field; TODO confirm.
gb_pattern = re.compile(r'\d+ GB')
gb_pattern.findall(abc)

['16 GB', '8 GB']