In [1]:
from google import genai
from pydantic import BaseModel
import json
import asyncio
from tqdm.asyncio import tqdm
from collections import defaultdict
import pandas as pd
from typing import List, Dict


In [2]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment, then
# look up the Gemini API key (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("GOOGLE_GENAI_API_KEY")

In [3]:
def group_by_hs_prefix(df):
    """Group the rows of ``df`` by the first four characters of their ``mahs`` value.

    Args:
        df: DataFrame with a ``mahs`` column. Each value is converted with
            ``str()`` before slicing, so load the column as text (for example
            ``dtype=str``) when leading zeros must be preserved.

    Returns:
        A ``defaultdict(list)`` mapping each prefix (at most 4 characters) to the
        list of matching rows, each converted to a plain dict. Rows whose
        ``mahs`` is missing (NaN/None) are skipped.
    """
    grouped = defaultdict(list)
    for _, row in df.iterrows():
        mahs_value = row['mahs']
        # A missing cell would otherwise be stringified into a bogus
        # "nan"/"None" prefix and end up as its own group.
        if pd.isna(mahs_value):
            continue
        prefix = str(mahs_value)[:4]  # stringify, then keep the first 4 characters
        grouped[prefix].append(row.to_dict())  # store the row as a plain dict
    return grouped


In [4]:

# One generated question/answer pair; both fields are plain strings.
# Documented with comments rather than a docstring so the schema pydantic
# derives from this model stays exactly as it was.
class QAItem(BaseModel):
    question: str
    answer: str

# Top-level response container: a list of QAItem entries under the "items" key.
# Passed as "response_schema" in fetch_hscode_qa_from_csv.
class QAList(BaseModel):
    items: list[QAItem]

In [5]:
from typing import List, Dict

def build_prompt_for_group(prefix: str, items: List[Dict], questions_per_item: int = 5) -> str:
    """Build the Q&A-generation prompt for one HS-prefix group.

    Args:
        prefix: The group prefix, interpolated into the prompt header.
        items: Row dicts for the group; each must provide ``mahs`` and ``mo_ta``
            keys, which are rendered as one ``- <mahs>: <mo_ta>`` bullet each.
        questions_per_item: How many question/answer pairs to request per item.
            Defaults to 5, which reproduces the previously hard-coded prompt.

    Returns:
        The prompt text. It lists the items, states the total number of pairs
        requested (``questions_per_item * len(items)``) and shows the expected
        JSON layout.

    Raises:
        ValueError: If ``questions_per_item`` is smaller than 1.
    """
    if questions_per_item < 1:
        raise ValueError("questions_per_item must be at least 1")

    items_list = "\n".join(
        f"- {item['mahs']}: {item['mo_ta']}" for item in items
    )
    # Total number of pairs requested; the same per-item figure is repeated in
    # the prompt text below so the two can never disagree.
    num_questions = questions_per_item * len(items)

    # Doubled braces ({{ }}) are literal braces in the f-string: they form the
    # JSON example shown to the model.
    prompt = f"""B·∫°n l√† chuy√™n gia ph√¢n lo·∫°i m√£ HS Code. D∆∞·ªõi ƒë√¢y l√† danh s√°ch c√°c m√£ HS thu·ªôc nh√≥m {prefix} c√πng m√¥ t·∫£ chi ti·∫øt:

{items_list}

---
**M·ª•c ti√™u:** T·∫°o ra **{num_questions} c·∫∑p c√¢u h·ªèi v√† c√¢u tr·∫£ l·ªùi t·ª± nhi√™n**, t∆∞∆°ng ƒë∆∞∆°ng **{questions_per_item} c·∫∑p cho m·ªói m√£ HS** trong nh√≥m.

---

‚úÖ **Y√™u c·∫ßu c·ª• th·ªÉ**:
1. **Ng√¥n ng·ªØ ƒë·ªùi th∆∞·ªùng**, t·ª± nhi√™n, th√¢n thi·ªán nh∆∞ trong h·ªôi tho·∫°i h·∫±ng ng√†y.
2. C√¢u h·ªèi mang t√≠nh **th·ª±c ti·ªÖn**, v√≠ d·ª•:
   - ‚ÄúT√¥i ƒëang chu·∫©n b·ªã nh·∫≠p kh·∫©u con X, th√¨ d√πng m√£ n√†o?‚Äù
   - ‚ÄúHai m√£ n√†y kh√°c nhau ·ªü ch·ªó n√†o khi l√†m th·ªß t·ª•c?‚Äù
   - ‚ÄúLo·∫°i n√†o √°p d·ª•ng cho h√†ng gi·ªëng v√† lo·∫°i n√†o cho h√†ng kh√¥ng gi·ªëng?‚Äù
3. T·∫≠p trung v√†o **so s√°nh, ph√¢n bi·ªát, h∆∞·ªõng d·∫´n ch·ªçn m√£** gi·ªØa c√°c m√£ HS trong c√πng nh√≥m.
4. N·ªôi dung ph·∫£i **ng·∫Øn g·ªçn nh∆∞ng d·ªÖ hi·ªÉu**, ph√π h·ª£p v·ªõi ng∆∞·ªùi kh√¥ng chuy√™n v·ªÅ h·∫£i quan ho·∫∑c HS code.
5. Tr√°nh thu·∫≠t ng·ªØ chuy√™n ng√†nh kh√≥ hi·ªÉu; thay v√†o ƒë√≥, d√πng v√≠ d·ª• c·ª• th·ªÉ, t√¨nh hu·ªëng nh·∫≠p h√†ng, kinh doanh, k√™ khai th·ª±c t·∫ø.

---

üì¶ **ƒê·∫ßu ra mong mu·ªën**: Tr·∫£ v·ªÅ ƒë√∫ng ƒë·ªãnh d·∫°ng JSON sau:

{{
  "items": [
    {{
      "question": "S·ª± kh√°c bi·ªát gi·ªØa m√£ 01013010 v√† 01013090 l√† g√¨?",
      "answer": "01013010 d√πng cho l·ª´a thu·∫ßn ch·ªßng ƒë·ªÉ nh√¢n gi·ªëng, c√≤n 01013090 l√† c√°c lo·∫°i l·ª´a kh√°c kh√¥ng d√πng nh√¢n gi·ªëng."
    }},
    ...
  ]
}}

L∆∞u √Ω: Tr·∫£ v·ªÅ **ch·ªâ JSON**, kh√¥ng c√≥ gi·∫£i th√≠ch ho·∫∑c m√¥ t·∫£ n√†o th√™m.
"""

    return prompt


In [6]:
import json

async def fetch_hscode_qa_from_csv(grouped_data: dict, model="gemini-2.0-flash-001"):
    """Generate question/answer pairs for every HS-prefix group via the Gemini API.

    For each ``prefix -> items`` entry a prompt is built with
    ``build_prompt_for_group`` and sent with a JSON response schema (``QAList``).
    Requests are issued one after another, in the iteration order of
    ``grouped_data``.

    Args:
        grouped_data: Mapping of HS prefix to the list of row dicts in that group.
        model: Model name forwarded to the API.

    Returns:
        A flat list with the ``items`` entries of every usable response, exactly
        as decoded from the JSON text. A group whose response is empty or is not
        valid JSON is skipped (a notice is printed) so that one bad response does
        not discard the pairs already collected. Errors raised by the API call
        itself still propagate.
    """
    all_qas = []  # accumulates the pairs from every prefix

    client = genai.Client(api_key=api_key)

    for prefix, items in grouped_data.items():
        content_data = build_prompt_for_group(prefix, items)

        # Await the SDK's async client: the synchronous call would block the
        # event loop for the whole request despite this being a coroutine.
        response = await client.aio.models.generate_content(
            model=model,
            contents=content_data,
            config={
                "response_mime_type": "application/json",
                "response_schema": QAList
            },
        )

        raw_text = response.text
        if not raw_text:
            # No text came back; json.loads(None) would raise TypeError.
            print(f"[skip] prefix {prefix}: empty response")
            continue

        try:
            data = json.loads(raw_text)
        except json.JSONDecodeError as exc:
            print(f"[skip] prefix {prefix}: response is not valid JSON ({exc})")
            continue

        # Only a JSON object carrying a list under "items" contributes pairs.
        qa_items = data.get("items") if isinstance(data, dict) else None
        if isinstance(qa_items, list):
            all_qas.extend(qa_items)

    return all_qas


In [7]:
import asyncio
# Input CSV location (absolute, machine-specific path).
data_path = "/home/vinh/HS Code/Data/new_mota.csv"
# dtype=str loads every column as text, so digit-only codes are not parsed as numbers.
df = pd.read_csv(data_path, dtype=str)
# Bucket the rows by the first four characters of their "mahs" value.
grouped = group_by_hs_prefix(df)


# Top-level await: this only works where the surrounding environment (e.g. a
# notebook kernel) runs cell code inside an event loop.
result = await fetch_hscode_qa_from_csv(grouped,"gemini-2.0-flash-001")
# Prints the complete list of generated pairs.
print(result)




[{'question': 'N·∫øu t√¥i nh·∫≠p ng·ª±a kh√¥ng ph·∫£i ƒë·ªÉ nh√¢n gi·ªëng th√¨ d√πng m√£ 01012900 ƒë√∫ng kh√¥ng?', 'answer': 'ƒê√∫ng v·∫≠y, m√£ 01012900 d√†nh cho ng·ª±a s·ªëng kh√¥ng d√πng ƒë·ªÉ nh√¢n gi·ªëng.'}, {'question': 'M√£ 01012900 √°p d·ª•ng cho lo·∫°i ng·ª±a n√†o?', 'answer': 'M√£ n√†y d√πng cho c√°c lo·∫°i ng·ª±a s·ªëng kh√°c, tr·ª´ ng·ª±a thu·∫ßn ch·ªßng d√πng ƒë·ªÉ nh√¢n gi·ªëng.'}, {'question': 'Ng·ª±a ƒëua nh·∫≠p v·ªÅ th√¨ c√≥ d√πng m√£ 01012900 ƒë∆∞·ª£c kh√¥ng?', 'answer': 'C√≥, n·∫øu ng·ª±a ƒëua kh√¥ng ph·∫£i l√† ng·ª±a thu·∫ßn ch·ªßng ƒë·ªÉ nh√¢n gi·ªëng th√¨ d√πng m√£ 01012900.'}, {'question': 'S·ª± kh√°c bi·ªát ch√≠nh gi·ªØa ng·ª±a thu·ªôc m√£ 01012900 v√† c√°c m√£ kh√°c trong nh√≥m 0101 l√† g√¨?', 'answer': '01012900 d√†nh cho ng·ª±a kh√¥ng d√πng v√†o m·ª•c ƒë√≠ch nh√¢n gi·ªëng, c√≤n c√°c m√£ kh√°c c√≥ th·ªÉ d√†nh cho ng·ª±a gi·ªëng ho·∫∑c c√°c lo√†i kh√°c nh∆∞ l·ª´a, la.'}, {'question': 'Khi n√†o th√¨ kh√¥ng d√πng m√£ 01012900?', 'answer': 'Khi nh·∫≠p ng·ª±a thu·

In [8]:
# Save the generated pairs as indented UTF-8 JSON (non-ASCII text is written as-is, not escaped).
with open("/home/vinh/HS Code/output_QA/hs_code_qa_demo.json", mode="w", encoding="utf-8") as f_json:
    f_json.write(json.dumps(result, ensure_ascii=False, indent=2))