In [None]:
# # Param
# test = "True"
# dataset = 'data'

In [2]:
is_test = True
if test == "False" or test == False:
    is_test = False

In [1]:
import os
print(os.getcwd())

/Users/dauduchieu/Desktop/iSE-CBM


In [2]:
is_test = True

In [4]:
from utils.data_loader import DataLoader

In [5]:
data_loader = DataLoader(dataset)

In [6]:
train_df = data_loader.get_data_train()

In [7]:
data_desc = data_loader.get_data_desc()
label_column = data_desc['label_column']
text_column = data_desc['text_column']

In [8]:
keyword_concepts = data_loader.get_keyword_concepts()
abstract_concepts = data_loader.get_abstract_concepts()

In [9]:
if is_test:
    train_df = train_df.groupby(label_column).sample(50)

In [10]:
from abc import ABC, abstractmethod
class LLMCaller(ABC):
    @abstractmethod
    def structed_output(self, prompt:str, output_struct):
        pass

from time import time, sleep
from typing import List, Callable, TypeVar

T = TypeVar('T')

class GeminiRateLimiter:
    def __init__(self, requests_per_minute: int = 15):
        self.rpm = requests_per_minute
        self.times: List[float] = []

    def wait(self):
        now = time()
        self.times = [t for t in self.times if now - t <= 60]
        if len(self.times) >= self.rpm:
            sleep(60 - (now - self.times[0]))
            now = time()
            self.times = [t for t in self.times if now - t <= 60]

    def record(self):
        self.times.append(time())

    def execute(self, f: Callable[..., T], *args, **kwargs) -> T:
        self.wait()
        try:
            r = f(*args, **kwargs)
            self.record()
            return r
        except:
            self.record()
            raise

    def __call__(self, f: Callable[..., T]) -> Callable[..., T]:
        def wrapper(*args, **kwargs):
            return self.execute(f, *args, **kwargs)
        return wrapper

    def __enter__(self):
        self.wait()
        return self

    def __exit__(self, *args):
        self.record()

from google import genai

class GeminiAPICaller(LLMCaller):
    def __init__(self, api_key:str, api_model:str, api_rpm:int):
        self.client = genai.Client(api_key=api_key)
        self.api_model = api_model
        self.rate_limiter = GeminiRateLimiter(requests_per_minute=api_rpm)

    def structed_output(self, prompt:str, output_struct):
        with self.rate_limiter:
            response = self.client.models.generate_content(
                model=self.api_model,
                contents=prompt,
                config={
                    'response_mime_type': 'application/json',
                    'response_schema': output_struct,
                },
            )

        res = response.parsed
        return res
    
llm_api_config = data_loader.get_llm_config()

llm_caller = GeminiAPICaller(
    api_key=llm_api_config['api_key'],
    api_model=llm_api_config['model'],
    api_rpm=llm_api_config['rate_per_minute']
)

In [11]:
import pandas as pd
import spacy
from tqdm import tqdm

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
tqdm.pandas(desc="Lemmatizing")
def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])
train_df['text_lemma'] = train_df[text_column].progress_apply(lemmatize_text)

Lemmatizing: 100%|██████████| 250/250 [00:07<00:00, 32.72it/s]


In [14]:
keywords = []
for k in keyword_concepts.keys():
    keywords += keyword_concepts[k]

print(keywords)

['coronary', 'myocardial', 'hypertension', 'cardiac', 'systolic', 'colitis', 'esophageal', 'gastrointestinal', 'bowel', 'duodenal', 'defect', 'loss', 'airway', 'graft', 'respiratory', 'cancer', 'carcinoma', 'sarcoma', 'malignancy', 'chemotherapy', 'brain', 'cerebral', 'neuronal', 'motor', 'cord']


In [15]:
def llm_get_synonyms(data_topic, keyword, n_syn=10):
    prompt = f"""
You are an expert in the {data_topic} domain.

Please list {n_syn} synonyms or alternative expressions for the term: "{keyword}".

The synonyms should be relevant to the {data_topic} context. If the term has multiple meanings, only return synonyms that are appropriate within this context.

Only return a list of synonyms as an array of strings, no explanation.
"""

    output_struct = {
        "type": "array",
        "items": {
            "type": "string"
        }
    }

    synonyms = llm_caller.structed_output(
        prompt=prompt.strip(),
        output_struct=output_struct
    )

    synonyms = [lemmatize_text(t) for t in synonyms]

    return synonyms

In [16]:
print(llm_get_synonyms(
    data_topic=data_desc['data_topic'],
    keyword='hypertension',
    n_syn=5
))

['high blood pressure', 'elevated blood pressure', 'elevated arterial pressure', 'arterial hypertension', 'high arterial pressure']


In [17]:
kw_synonym_dict = {}
for kw in tqdm(keywords, total=len(keywords), desc="Create synonym dict"):
    llm_syn = llm_get_synonyms(data_topic=data_desc['data_topic'], keyword=kw, n_syn=5)
    llm_syn = [lemmatize_text(s) for s in llm_syn]
    kw_synonym_dict.update({
        f"{kw}": [kw] + llm_syn
    })

print(kw_synonym_dict)

Create synonym dict: 100%|██████████| 25/25 [01:16<00:00,  3.07s/it]

{'coronary': ['coronary', 'cardiac', 'heart', 'aortic', 'arterial', 'vascular'], 'myocardial': ['myocardial', 'cardiac', 'heart muscle', 'ventricular', 'atrial', 'coronary'], 'hypertension': ['hypertension', 'high blood pressure', 'elevated blood pressure', 'arterial hypertension', 'chronic hypertension', 'sustained hypertension'], 'cardiac': ['cardiac', 'heart', 'coronary', 'cardiovascular', 'myocardial', 'cardiac muscle'], 'systolic': ['systolic', 'Systole', 'systolic contraction', 'heart contraction phase', 'pump phase', 'systolic ejection'], 'colitis': ['colitis', 'colonic inflammation', 'Inflammation of the large intestine', 'enterocolitis', 'proctocolitis', 'colopathy'], 'esophageal': ['esophageal', 'relate to the esophagus', 'of the esophagus', 'esophageal', 'esophag(o)-', 'gullet'], 'gastrointestinal': ['gastrointestinal', 'digestive', 'enteric', 'abdominal', 'stomach and bowel', 'alimentary'], 'bowel': ['bowel', 'intestine', 'gut', 'bowel tract', 'digestive tract', 'alimentary




In [18]:
texts = train_df['text_lemma']
print(len(texts))
print(keywords)
print(kw_synonym_dict)

250
['coronary', 'myocardial', 'hypertension', 'cardiac', 'systolic', 'colitis', 'esophageal', 'gastrointestinal', 'bowel', 'duodenal', 'defect', 'loss', 'airway', 'graft', 'respiratory', 'cancer', 'carcinoma', 'sarcoma', 'malignancy', 'chemotherapy', 'brain', 'cerebral', 'neuronal', 'motor', 'cord']
{'coronary': ['coronary', 'cardiac', 'heart', 'aortic', 'arterial', 'vascular'], 'myocardial': ['myocardial', 'cardiac', 'heart muscle', 'ventricular', 'atrial', 'coronary'], 'hypertension': ['hypertension', 'high blood pressure', 'elevated blood pressure', 'arterial hypertension', 'chronic hypertension', 'sustained hypertension'], 'cardiac': ['cardiac', 'heart', 'coronary', 'cardiovascular', 'myocardial', 'cardiac muscle'], 'systolic': ['systolic', 'Systole', 'systolic contraction', 'heart contraction phase', 'pump phase', 'systolic ejection'], 'colitis': ['colitis', 'colonic inflammation', 'Inflammation of the large intestine', 'enterocolitis', 'proctocolitis', 'colopathy'], 'esophageal'

In [19]:
def keyword_presence_matrix_from_df(df, keywords, kw_synonym_dict,
                                    text_col='text_column', lemma_col='text_lemma'):
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Keyword weak labeling"):
        original_text = row[text_col]
        lemmatized_text = row[lemma_col]

        for kw in keywords:
            syns = kw_synonym_dict.get(kw, [])
            all_terms = [kw] + syns
            score = int(any(term in lemmatized_text for term in all_terms))
            results.append((original_text, kw, score))

    return pd.DataFrame(results, columns=["text", "keyword", "score"])


In [20]:
kw_wl_df = keyword_presence_matrix_from_df(train_df, keywords, kw_synonym_dict, text_column)

Keyword weak labeling: 100%|██████████| 250/250 [00:00<00:00, 6770.86it/s]


In [21]:
kw_wl_df.sample(5)

Unnamed: 0,text,keyword,score
2593,A randomized clinical trial to compare two dif...,malignancy,0
4248,Indications for the surgical treatment of oste...,motor,0
1335,Endoscopic appearance and significance of func...,defect,0
1116,Multivariate analysis in the prediction of dea...,carcinoma,0
5025,Conditioning of the spinal stretch reflex: imp...,coronary,0


In [22]:
kw_wl_df.value_counts('score')

score
0    5683
1     567
Name: count, dtype: int64

In [23]:
print(abstract_concepts)

[{'abstract_concept_name': 'Cardiac Function and Disorders', 'description': "Keywords related to the heart's function and common cardiovascular diseases.", 'keywords': ['coronary', 'myocardial', 'hypertension', 'cardiac', 'systolic'], 'label': 'cardiovascular diseases'}, {'abstract_concept_name': 'Heart Muscle and Blood Pressure', 'description': 'Terms indicating issues with heart muscle and elevated blood pressure.', 'keywords': ['myocardial', 'hypertension', 'cardiac', 'systolic'], 'label': 'cardiovascular diseases'}, {'abstract_concept_name': 'Coronary Artery Issues', 'description': 'Keywords specifically pointing to problems within the coronary arteries.', 'keywords': ['coronary', 'cardiac'], 'label': 'cardiovascular diseases'}, {'abstract_concept_name': 'Intestinal and Esophageal Conditions', 'description': 'Diseases affecting the digestive tract, including the intestines and esophagus.', 'keywords': ['colitis', 'esophageal', 'gastrointestinal', 'bowel', 'duodenal'], 'label': 'dig

In [24]:
def aggregate_full_concept_matrix(wl_df, abstract_concepts):
    # Lọc score == 1 trong wl_df để dễ truy cập
    matched = wl_df[wl_df['score'] == 1]

    # Tạo set {(text, keyword)} đã match
    matched_pairs = set(zip(matched['text'], matched['keyword']))

    # Lấy danh sách unique texts
    texts = wl_df['text'].unique()

    # Kết quả
    results = []

    for text in tqdm(texts, total=len(texts), desc="Abstract concept weak labeling"):
        for concept in abstract_concepts:
            concept_name = concept['abstract_concept_name']
            concept_keywords = concept['keywords']

            # Nếu có ít nhất một keyword trong concept xuất hiện trong matched_pairs → score = 1
            score = int(any((text, kw) in matched_pairs for kw in concept_keywords))

            results.append((text, concept_name, score))

    return pd.DataFrame(results, columns=["text", "abstract_concept", "score"])


In [25]:
abstract_df = aggregate_full_concept_matrix(kw_wl_df, abstract_concepts)

Abstract concept weak labeling: 100%|██████████| 250/250 [00:00<00:00, 133542.54it/s]


In [26]:
abstract_df.sample(5)

Unnamed: 0,text,abstract_concept,score
860,Absorption of carbon 13-labeled rice in milk b...,Inflammatory Bowel Diseases,0
3575,Platelet monoamine oxidase activity in female ...,Inflammatory Bowel Diseases,0
1118,Benefit of ketotifen in patients with eosinoph...,Tissue and Graft Issues,0
825,Estimates of morbidity and mortality rates for...,Cardiac Function and Disorders,0
1871,Intestinal metaplasia is age related in Barret...,Oncological Malignancies,1


In [27]:
abstract_df.value_counts('score')

score
0    2977
1     773
Name: count, dtype: int64

In [28]:
from utils.data_io import join_path, save_csv

In [29]:
save_csv(kw_wl_df, dir=join_path(dataset, 'weak_label_data'), file_name='keyword_wl.csv')

In [30]:
save_csv(abstract_df, dir=join_path(dataset, 'weak_label_data'), file_name='abstract_wl.csv')