In [1]:
# Parameters
# `test` selects a reduced-size run: a later cell derives `is_test` from it and
# down-samples the training data when that flag is true. The later check accepts
# either a bool or the string "False".
# NOTE(review): presumably this cell is overridden by a notebook runner
# (e.g. a parameters-tagged cell) - confirm.
test = True


In [2]:
# # Param
# test = "True"

In [3]:
import os
print(os.getcwd())

/Users/dauduchieu/Documents/iSE2025/CBM


In [4]:
# Normalise the `test` parameter into a real boolean.
# Parameters injected from outside the notebook can arrive as strings, so any
# capitalisation of "false" (with stray whitespace) disables test mode, exactly
# like a real `False`. Every other value keeps test mode switched on.
if isinstance(test, str):
    is_test = test.strip().lower() != "false"
else:
    # Equality (not identity) on purpose: values equal to False, such as 0, also disable it.
    is_test = not (test == False)  # noqa: E712

In [5]:
dataset = "data"

In [6]:
from utils.data_loader import DataLoader

In [7]:
data_loader = DataLoader(dataset)

In [8]:
train_df = data_loader.get_data_train()

In [9]:
data_desc = data_loader.get_data_desc()
label_column = data_desc['label_column']
text_column = data_desc['text_column']

In [10]:
labels = train_df[label_column].unique()
labels

array(['general pathological conditions', 'neoplasms',
       'digestive system diseases', 'nervous system diseases',
       'cardiovascular diseases'], dtype=object)

In [11]:
if is_test:
    # Test mode: keep at most 50 rows per label so the rest of the notebook runs quickly.
    # Each label group is shuffled with a fixed seed and then cut to its first 50 rows.
    # This makes the subset reproducible and also works for labels that have fewer
    # than 50 rows (a plain `.sample(50)` raises ValueError in that case).
    train_df = (
        train_df
        .groupby(label_column)
        .sample(frac=1, random_state=42)
        .groupby(label_column)
        .head(50)
    )

In [12]:
from abc import ABC, abstractmethod
class LLMCaller(ABC):
    """Abstract base class for LLM clients that can return structured output."""

    @abstractmethod
    def structed_output(self, prompt:str, output_struct):
        """Send `prompt` to the model and return a result described by `output_struct`.

        Subclasses must override this method; the base implementation does
        nothing and returns None.
        """
        ...

In [13]:
from functools import wraps
from time import time, sleep
from typing import List, Callable, TypeVar

T = TypeVar('T')

class GeminiRateLimiter:
    """Sliding-window rate limiter: at most `requests_per_minute` calls per 60 seconds.

    The limiter stores the timestamp of every recorded call. `wait()` blocks
    until fewer than `rpm` timestamps fall inside the last 60 seconds, and
    `record()` adds the current time. It can be used in three ways:

    * ``limiter.execute(f, *args, **kwargs)``
    * as a decorator: ``@limiter``
    * as a context manager: ``with limiter: ...``

    In all three the call is recorded when it finishes, whether it succeeded
    or raised.
    """

    # Length of the sliding window, in seconds.
    WINDOW_SECONDS = 60

    def __init__(self, requests_per_minute: int = 15):
        """Create a limiter.

        Args:
            requests_per_minute: Maximum number of calls allowed per window.

        Raises:
            ValueError: If `requests_per_minute` is smaller than 1 (such a
                limiter could never let a call through).
        """
        if requests_per_minute < 1:
            raise ValueError("requests_per_minute must be at least 1")
        self.rpm = requests_per_minute
        self.times: List[float] = []

    def wait(self):
        """Block until another call is allowed under the rate limit."""
        # Loop instead of checking once: after sleeping, the window is
        # re-evaluated so the limit holds even if the sleep returned early or
        # the oldest timestamp sits exactly on the window boundary.
        while True:
            now = time()
            self.times = [t for t in self.times if now - t < self.WINDOW_SECONDS]
            if len(self.times) < self.rpm:
                return
            # Sleep until the oldest call in the window expires.
            sleep(max(0.0, self.WINDOW_SECONDS - (now - min(self.times))))

    def record(self):
        """Register a call at the current time."""
        self.times.append(time())

    def execute(self, f: Callable[..., T], *args, **kwargs) -> T:
        """Wait for a free slot, call `f(*args, **kwargs)` and return its result.

        The call is recorded even when `f` raises; the exception propagates.
        """
        self.wait()
        try:
            return f(*args, **kwargs)
        finally:
            self.record()

    def __call__(self, f: Callable[..., T]) -> Callable[..., T]:
        """Decorator form: return a rate-limited version of `f`."""
        @wraps(f)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            return self.execute(f, *args, **kwargs)
        return wrapper

    def __enter__(self):
        """Context-manager entry: wait for a free slot and return the limiter."""
        self.wait()
        return self

    def __exit__(self, *args):
        """Context-manager exit: record the call; exceptions are not suppressed."""
        self.record()

In [14]:
from google import genai

class GeminiAPICaller(LLMCaller):
    """LLMCaller implementation that talks to Gemini through `genai.Client`.

    Every request goes through a GeminiRateLimiter configured with `api_rpm`.
    """

    def __init__(self, api_key:str, api_model:str, api_rpm:int):
        """Create the client, remember the model name and set up the rate limiter."""
        self.client = genai.Client(api_key=api_key)
        self.api_model = api_model
        self.rate_limiter = GeminiRateLimiter(requests_per_minute=api_rpm)

    def structed_output(self, prompt:str, output_struct):
        """Request a JSON reply for `prompt` and return the response's `parsed` value.

        `output_struct` is passed to the API as the response schema.
        """
        generation_config = {
            'response_mime_type': 'application/json',
            'response_schema': output_struct,
        }

        # Entering the limiter blocks until a slot is free; leaving it records the call.
        with self.rate_limiter:
            response = self.client.models.generate_content(
                model=self.api_model,
                contents=prompt,
                config=generation_config,
            )

        return response.parsed

In [15]:
# Build the shared LLM client from the settings returned by the data loader.
# The config must provide the keys 'api_key', 'model' and 'rate_per_minute'.
llm_api_config = data_loader.get_llm_config()

# `llm_caller` is read as a global by llm_filter_keyword_concepts further down.
llm_caller = GeminiAPICaller(
    api_key=llm_api_config['api_key'],
    api_model=llm_api_config['model'],
    api_rpm=llm_api_config['rate_per_minute']
)

In [16]:
import numpy as np
import spacy
from collections import defaultdict
import math
from tqdm import tqdm

In [17]:
nlp = spacy.load("en_core_web_sm")

In [18]:
def extract_custom_candidates(text, use_pos=True, pos_list=None, use_ner=True, use_chunks=True):
    """Extract lower-cased, lemmatised keyword candidates from `text`.

    The text is parsed once with the module-level spaCy pipeline `nlp`, and
    candidates are collected from up to three sources, in this order:

    1. single tokens whose part of speech is in `pos_list` (stop words and
       non-alphabetic tokens are skipped),
    2. named entities (all of their tokens, joined by spaces),
    3. noun chunks (stop words and non-alphabetic tokens removed; chunks that
       become empty are dropped).

    Args:
        text (str): Text to analyse.
        use_pos (bool): Include single-token candidates.
        pos_list: Allowed coarse POS tags; defaults to NOUN, PROPN and ADJ.
        use_ner (bool): Include named entities.
        use_chunks (bool): Include noun chunks.

    Returns:
        list[str]: Candidates in extraction order; duplicates are kept.
    """
    doc = nlp(text)
    candidates = []

    if use_pos:
        allowed_pos = ["NOUN", "PROPN", "ADJ"] if pos_list is None else pos_list
        candidates.extend(
            token.lemma_.lower()
            for token in doc
            if token.pos_ in allowed_pos and not token.is_stop and token.is_alpha
        )

    if use_ner:
        # Reuse the entity's tokens from the parsed document. Running the whole
        # pipeline again on `ent.text` costs one extra parse per entity and
        # lemmatises the words without their sentence context.
        for ent in doc.ents:
            candidates.append(" ".join(token.lemma_.lower() for token in ent))

    if use_chunks:
        for chunk in doc.noun_chunks:
            chunk_text = " ".join(
                token.lemma_.lower()
                for token in chunk
                if not token.is_stop and token.is_alpha
            ).strip()
            if chunk_text:
                candidates.append(chunk_text)

    return candidates

In [19]:
def extract_keywords_with_df_ilf(df, text_column, label_column, top_n=15,
    use_pos=True, pos_list=None, use_ner=True, use_chunks=True, smooth=True,
    threshold=0.02
):
    """Rank keyword candidates per label by document frequency x inverse label frequency.

    For every label, each text is turned into a set of candidates with
    `extract_custom_candidates`, and the number of texts containing each term
    (its document frequency, DF) is counted. A term's inverse label frequency
    (ILF) is ``log(num_labels / k)``, where ``k`` is the number of labels in
    which the term's normalised DF exceeds `threshold`; it is 0 when no label
    exceeds the threshold. The score of a term for a label is ``DF * ILF``.

    Args:
        df: DataFrame holding the texts and labels (it is not modified).
        text_column (str): Column with the texts; values are converted with `str`.
        label_column (str): Column with the labels; rows with a missing label are ignored.
        top_n (int): Maximum number of keywords returned per label.
        use_pos, pos_list, use_ner, use_chunks: Passed to `extract_custom_candidates`.
        smooth (bool): Use ``log(1 + DF)`` instead of the raw normalised DF.
        threshold (float): Minimum normalised DF for a label to count towards ILF.

    Returns:
        defaultdict(list): label -> up to `top_n` terms that occur under that
        label, best score first; ties are broken alphabetically.
    """
    # Missing labels match no row (NaN != NaN); they would yield a label with
    # zero texts and a division by zero below, so drop them up front.
    labels = df[label_column].dropna().unique()
    num_labels = len(labels)

    doc_freq_by_label = {}  # label -> {term: number of texts containing it}
    label_counts = {}       # label -> number of texts

    for label in labels:
        texts = df.loc[df[label_column] == label, text_column].astype(str).tolist()
        label_counts[label] = len(texts)

        term_doc_freq = defaultdict(int)
        for text in tqdm(texts, total=len(texts), desc=f"Label: {label}"):
            candidates = extract_custom_candidates(
                text, use_pos=use_pos, pos_list=pos_list,
                use_ner=use_ner, use_chunks=use_chunks
            )
            # A term counts once per text, however often it appears in it.
            for term in set(candidates):
                term_doc_freq[term] += 1
        # Freeze into a plain dict so later look-ups cannot add entries.
        doc_freq_by_label[label] = dict(term_doc_freq)

    all_terms = set().union(*doc_freq_by_label.values())

    # ILF
    ilf_scores = {}
    for term in all_terms:
        # `.get` matters: indexing a defaultdict here would insert a zero-count
        # entry for every term under every label, and those terms could then be
        # returned as keywords of labels they never occur in.
        label_occurrences = sum(
            1 for label in labels
            if (doc_freq_by_label[label].get(term, 0) / label_counts[label]) > threshold
        )
        ilf_scores[term] = np.log(num_labels / label_occurrences) if label_occurrences > 0 else 0

    ranked_keywords_per_label = defaultdict(list)

    for label in labels:
        keyword_scores = {}
        for term, df_val in doc_freq_by_label[label].items():
            normalized_df = df_val / label_counts[label]
            df_score = np.log(1 + normalized_df) if smooth else normalized_df
            keyword_scores[term] = df_score * ilf_scores[term]
        # Highest score first; the alphabetical tie-break keeps the result
        # identical between runs (set iteration order is not stable).
        sorted_keywords = sorted(keyword_scores.items(), key=lambda item: (-item[1], item[0]))
        ranked_keywords_per_label[label] = [kw for kw, _ in sorted_keywords[:top_n]]

    return ranked_keywords_per_label

In [20]:
keywords = extract_keywords_with_df_ilf(
    df=train_df,
    text_column=text_column,
    label_column=label_column,
    top_n=10,
    use_pos=True,
    pos_list=["NOUN", "PROPN", "ADJ"],
    use_ner=False,
    use_chunks=True,
    threshold=0.02
)

Label: cardiovascular diseases:   0%|                                                                                                                                  | 0/50 [00:00<?, ?it/s]

Label: cardiovascular diseases:   4%|████▉                                                                                                                     | 2/50 [00:00<00:02, 19.99it/s]

Label: cardiovascular diseases:  12%|██████████████▋                                                                                                           | 6/50 [00:00<00:01, 30.35it/s]

Label: cardiovascular diseases:  20%|████████████████████████▏                                                                                                | 10/50 [00:00<00:01, 30.86it/s]

Label: cardiovascular diseases:  28%|█████████████████████████████████▉                                                                                       | 14/50 [00:00<00:01, 25.32it/s]

Label: cardiovascular diseases:  36%|███████████████████████████████████████████▌                                                                             | 18/50 [00:00<00:01, 24.82it/s]

Label: cardiovascular diseases:  42%|██████████████████████████████████████████████████▊                                                                      | 21/50 [00:00<00:01, 24.58it/s]

Label: cardiovascular diseases:  48%|██████████████████████████████████████████████████████████                                                               | 24/50 [00:00<00:01, 22.98it/s]

Label: cardiovascular diseases:  56%|███████████████████████████████████████████████████████████████████▊                                                     | 28/50 [00:01<00:00, 26.83it/s]

Label: cardiovascular diseases:  62%|███████████████████████████████████████████████████████████████████████████                                              | 31/50 [00:01<00:00, 25.22it/s]

Label: cardiovascular diseases:  70%|████████████████████████████████████████████████████████████████████████████████████▋                                    | 35/50 [00:01<00:00, 28.10it/s]

Label: cardiovascular diseases:  76%|███████████████████████████████████████████████████████████████████████████████████████████▉                             | 38/50 [00:01<00:00, 24.12it/s]

Label: cardiovascular diseases:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 41/50 [00:01<00:00, 22.86it/s]

Label: cardiovascular diseases:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 44/50 [00:01<00:00, 22.46it/s]

Label: cardiovascular diseases:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 47/50 [00:01<00:00, 23.07it/s]

Label: cardiovascular diseases: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 23.84it/s]

Label: cardiovascular diseases: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 24.68it/s]




Label: digestive system diseases:   0%|                                                                                                                                | 0/50 [00:00<?, ?it/s]

Label: digestive system diseases:   6%|███████▏                                                                                                                | 3/50 [00:00<00:01, 28.14it/s]

Label: digestive system diseases:  14%|████████████████▊                                                                                                       | 7/50 [00:00<00:01, 32.95it/s]

Label: digestive system diseases:  22%|██████████████████████████▏                                                                                            | 11/50 [00:00<00:01, 30.46it/s]

Label: digestive system diseases:  30%|███████████████████████████████████▋                                                                                   | 15/50 [00:00<00:01, 29.08it/s]

Label: digestive system diseases:  40%|███████████████████████████████████████████████▌                                                                       | 20/50 [00:00<00:00, 33.88it/s]

Label: digestive system diseases:  48%|█████████████████████████████████████████████████████████                                                              | 24/50 [00:00<00:00, 35.58it/s]

Label: digestive system diseases:  56%|██████████████████████████████████████████████████████████████████▋                                                    | 28/50 [00:00<00:00, 35.11it/s]

Label: digestive system diseases:  64%|████████████████████████████████████████████████████████████████████████████▏                                          | 32/50 [00:00<00:00, 31.81it/s]

Label: digestive system diseases:  72%|█████████████████████████████████████████████████████████████████████████████████████▋                                 | 36/50 [00:01<00:00, 30.40it/s]

Label: digestive system diseases:  80%|███████████████████████████████████████████████████████████████████████████████████████████████▏                       | 40/50 [00:01<00:00, 29.39it/s]

Label: digestive system diseases:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 44/50 [00:01<00:00, 29.35it/s]

Label: digestive system diseases:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 47/50 [00:01<00:00, 27.41it/s]

Label: digestive system diseases: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.87it/s]

Label: digestive system diseases: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 29.99it/s]




Label: general pathological conditions:   0%|                                                                                                                          | 0/50 [00:00<?, ?it/s]

Label: general pathological conditions:   6%|██████▊                                                                                                           | 3/50 [00:00<00:01, 28.31it/s]

Label: general pathological conditions:  14%|███████████████▉                                                                                                  | 7/50 [00:00<00:01, 29.65it/s]

Label: general pathological conditions:  22%|████████████████████████▊                                                                                        | 11/50 [00:00<00:01, 33.51it/s]

Label: general pathological conditions:  30%|█████████████████████████████████▉                                                                               | 15/50 [00:00<00:00, 35.27it/s]

Label: general pathological conditions:  40%|█████████████████████████████████████████████▏                                                                   | 20/50 [00:00<00:00, 36.03it/s]

Label: general pathological conditions:  48%|██████████████████████████████████████████████████████▏                                                          | 24/50 [00:00<00:00, 31.35it/s]

Label: general pathological conditions:  56%|███████████████████████████████████████████████████████████████▎                                                 | 28/50 [00:00<00:00, 31.92it/s]

Label: general pathological conditions:  70%|███████████████████████████████████████████████████████████████████████████████                                  | 35/50 [00:00<00:00, 41.63it/s]

Label: general pathological conditions:  80%|██████████████████████████████████████████████████████████████████████████████████████████▍                      | 40/50 [00:01<00:00, 37.19it/s]

Label: general pathological conditions:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████▍             | 44/50 [00:01<00:00, 32.22it/s]

Label: general pathological conditions:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 48/50 [00:01<00:00, 31.11it/s]

Label: general pathological conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 33.59it/s]




Label: neoplasms:   0%|                                                                                                                                                | 0/50 [00:00<?, ?it/s]

Label: neoplasms:   6%|████████▏                                                                                                                               | 3/50 [00:00<00:01, 29.71it/s]

Label: neoplasms:  14%|███████████████████                                                                                                                     | 7/50 [00:00<00:01, 32.85it/s]

Label: neoplasms:  22%|█████████████████████████████▋                                                                                                         | 11/50 [00:00<00:01, 28.38it/s]

Label: neoplasms:  28%|█████████████████████████████████████▊                                                                                                 | 14/50 [00:00<00:01, 28.46it/s]

Label: neoplasms:  36%|████████████████████████████████████████████████▌                                                                                      | 18/50 [00:00<00:01, 31.87it/s]

Label: neoplasms:  46%|██████████████████████████████████████████████████████████████                                                                         | 23/50 [00:00<00:00, 36.09it/s]

Label: neoplasms:  54%|████████████████████████████████████████████████████████████████████████▉                                                              | 27/50 [00:00<00:00, 33.67it/s]

Label: neoplasms:  62%|███████████████████████████████████████████████████████████████████████████████████▋                                                   | 31/50 [00:00<00:00, 33.61it/s]

Label: neoplasms:  70%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 35/50 [00:01<00:00, 32.66it/s]

Label: neoplasms:  78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 39/50 [00:01<00:00, 31.59it/s]

Label: neoplasms:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 43/50 [00:01<00:00, 31.61it/s]

Label: neoplasms:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 47/50 [00:01<00:00, 31.31it/s]

Label: neoplasms: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 32.48it/s]




Label: nervous system diseases:   0%|                                                                                                                                  | 0/50 [00:00<?, ?it/s]

Label: nervous system diseases:   8%|█████████▊                                                                                                                | 4/50 [00:00<00:01, 35.94it/s]

Label: nervous system diseases:  18%|█████████████████████▉                                                                                                    | 9/50 [00:00<00:01, 39.76it/s]

Label: nervous system diseases:  26%|███████████████████████████████▍                                                                                         | 13/50 [00:00<00:00, 38.00it/s]

Label: nervous system diseases:  38%|█████████████████████████████████████████████▉                                                                           | 19/50 [00:00<00:00, 43.79it/s]

Label: nervous system diseases:  48%|██████████████████████████████████████████████████████████                                                               | 24/50 [00:00<00:00, 36.49it/s]

Label: nervous system diseases:  56%|███████████████████████████████████████████████████████████████████▊                                                     | 28/50 [00:00<00:00, 34.95it/s]

Label: nervous system diseases:  64%|█████████████████████████████████████████████████████████████████████████████▍                                           | 32/50 [00:00<00:00, 35.16it/s]

Label: nervous system diseases:  72%|███████████████████████████████████████████████████████████████████████████████████████                                  | 36/50 [00:01<00:00, 32.83it/s]

Label: nervous system diseases:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 42/50 [00:01<00:00, 35.85it/s]

Label: nervous system diseases:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 48/50 [00:01<00:00, 37.30it/s]

Label: nervous system diseases: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 35.97it/s]




In [21]:
keywords = dict(keywords)
print(keywords)

{'cardiovascular diseases': ['myocardial', 'coronary', 'left', 'heart', 'hypertension', 'ventricular', 'infarction', 'diastolic', 'ischemia', 'min'], 'digestive system diseases': ['obstruction', 'bowel', 'biliary', 'endoscopic', 'stone', 'meal', 'bile', 'pancreatitis', 'ulcer', 'absorption'], 'general pathological conditions': ['hip', 'deficiency', 'mouse', 'eye', 'survey', 'bone', 'disorder', 'oncogene', 'nature', 'necrosis'], 'neoplasms': ['lymph', 'node', 'tumor', 'carcinoma', 'breast', 'metastasis', 'adenocarcinoma', 'chemotherapy', 'classification', 'malignant'], 'nervous system diseases': ['spinal', 'deficit', 'mental', 'neurologic', 'impairment', 'cognitive', 'epilepsy', 'way', 'cord', 'seizure']}


In [22]:
def llm_filter_keyword_concepts(keyword_concepts, data_topic, n_concepts=5, llm=None):
    """
    Filter keyword concepts for each label with additional context of keywords from other labels.

    One LLM request is made per label. The model's answer is validated before
    it is stored: only keywords from that label's raw list are kept (matched
    case-insensitively, ignoring surrounding whitespace), duplicates are
    removed, and at most `n_concepts` keywords are returned.

    Args:
        keyword_concepts (dict): Mapping from label name to a list of raw keywords.
        data_topic (str): The domain context (e.g., 'biomedical research abstracts').
        n_concepts (int): Number of keywords to keep per label.
        llm: Object with a `structed_output(prompt=..., output_struct=...)` method.
            Defaults to the module-level `llm_caller`.

    Returns:
        dict: Mapping from label name to list of filtered keywords.

    Raises:
        ValueError: If the LLM reply for a label is not a JSON object
            (for example when the response could not be parsed).
    """
    caller = llm_caller if llm is None else llm
    filtered_keywords_by_label = {}
    all_labels = list(keyword_concepts.keys())

    for target_label in all_labels:
        target_keywords = keyword_concepts[target_label]
        keyword_list = ", ".join(target_keywords)

        # Add reference keywords from other labels
        competing_info = "".join(
            f'- {other_label}: {", ".join(keyword_concepts[other_label])}\n'
            for other_label in all_labels
            if other_label != target_label
        )

        # Construct prompt
        prompt = f"""
You are a domain expert working on the topic: "{data_topic}".

Your task is to select the top {n_concepts} most meaningful and representative terms for the category "{target_label}".

---

### CATEGORY:
"{target_label}"

### RAW EXTRACTED KEYWORDS:
{keyword_list}

---

### REFERENCE KEYWORDS FROM OTHER CATEGORIES:
(These are keywords from competing labels. Use them to ensure your selections are distinctive.)

{competing_info}

---

### INSTRUCTIONS:
- Select exactly {n_concepts} keywords from the provided list under RAW EXTRACTED KEYWORDS.
- Chosen terms should:
  - Be highly relevant and representative of the target category
  - Clearly distinguish the target category from others
  - Be specific, unambiguous, and non-generic

Avoid:
- Generic terms (e.g., "thing", "aspect", "problem")
- Redundant or overlapping words
- Keywords similar to those in other categories

Return your answer as a JSON object in the format:
{{ "{target_label}": ["keyword1", "keyword2", ...] }}
""".strip()

        output_struct = {
            "type": "object",
            "properties": {
                target_label: {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                }
            },
            "required": [target_label]
        }

        result = caller.structed_output(prompt=prompt, output_struct=output_struct)
        if not isinstance(result, dict):
            # Previously this surfaced as an AttributeError on `.get`.
            raise ValueError(
                f"LLM returned no usable JSON object for label {target_label!r}: {result!r}"
            )

        # Map normalised spelling -> keyword as it appears in the raw list, so
        # the output always uses the original spelling.
        canonical = {}
        for kw in target_keywords:
            canonical.setdefault(str(kw).strip().lower(), kw)

        # Enforce the contract the prompt asks for: a subset of the raw list,
        # without duplicates, no longer than n_concepts.
        selected = []
        for kw in result.get(target_label) or []:
            match = canonical.get(str(kw).strip().lower())
            if match is not None and match not in selected:
                selected.append(match)
        filtered_keywords_by_label[target_label] = selected[:n_concepts]

    return filtered_keywords_by_label

In [23]:
keyword_concepts = llm_filter_keyword_concepts(keywords, data_desc['data_topic'], n_concepts=5)

In [24]:
keyword_concepts

{'cardiovascular diseases': ['hypertension',
  'myocardial',
  'infarction',
  'ischemia',
  'coronary'],
 'digestive system diseases': ['pancreatitis',
  'ulcer',
  'obstruction',
  'biliary',
  'bowel'],
 'general pathological conditions': ['deficiency',
  'disorder',
  'necrosis',
  'oncogene',
  'bone'],
 'neoplasms': ['tumor',
  'carcinoma',
  'adenocarcinoma',
  'malignant',
  'metastasis'],
 'nervous system diseases': ['neurologic',
  'epilepsy',
  'seizure',
  'spinal',
  'cord']}

In [25]:
def abstract_concept_prompt(
    data_topic,
    text_column, label_column,
    text_description, label_description,
    label_concepts,
    keyword_concepts, 
    min_abstract_concepts_per_label=3, max_abstract_concepts_per_label=4
):
    """Build the prompt that asks an LLM to group keywords into abstract concepts.

    Args:
        data_topic: Topic of the dataset; inserted into the prompt.
        text_column, label_column: Accepted for interface compatibility; not used in the prompt.
        text_description, label_description: Descriptions inserted into the data context.
        label_concepts: Iterable of labels; each is converted with `str` and listed as a bullet.
        keyword_concepts (dict): label -> list of keywords; each pair becomes one bullet line.
        min_abstract_concepts_per_label: Lower bound stated in the grouping constraint.
        max_abstract_concepts_per_label: Upper bound; when None only the lower bound is stated.

    Returns:
        str: The prompt, stripped of leading and trailing whitespace.
    """
    # Bullet list of labels, starting on a new line after the heading.
    label_concepts_str = "\n- " + "\n- ".join(map(str, label_concepts))

    # One line per (keyword, label) pair, each terminated by a newline.
    keyword_list_str = "".join(
        f"- Keyword: \"{kw}\", Label: \"{label}\"\n"
        for label, label_keywords in keyword_concepts.items()
        for kw in label_keywords
    )

    if max_abstract_concepts_per_label is None:
        bounds = f"at least {min_abstract_concepts_per_label}"
    else:
        bounds = (
            f"at least {min_abstract_concepts_per_label}"
            f" and at most {max_abstract_concepts_per_label}"
        )
    constraint_instruction = (
        f"- For each label, you must group the keywords into {bounds}"
        " distinct and meaningful abstract concepts."
    )

    return f"""
Data context:

- Data topic: {data_topic}
- Text column description: {text_description}
- Label column description: {label_description}

Input:

- List of label concepts in the dataset: {label_concepts_str}

- List of extracted keyword concepts, each associated with a label:
{keyword_list_str}

Important instructions:

- A keyword can belong to multiple abstract concepts if semantically appropriate.
- Likewise, an abstract concept can contain multiple keywords.
- The abstract concepts must act as meaningful intermediate concepts between keywords and labels.
- Abstract concepts must be:
  - More specific than the label concept.
  - More general and meaningful than individual keywords.
  - They should never simply repeat or copy the label names.

{constraint_instruction}

Your task:

- Group the keywords into abstract, higher-level concepts (abstract concepts).
- Each abstract concept must:
  - Include a list of related keywords (allowing keyword overlap across groups if appropriate).
  - Have a clear, concise, and meaningful name relevant to the {data_topic} domain.
  - Be clearly mapped to one of the provided label concepts.

Expected output:

- A valid JSON array where each item represents one abstract concept.
- Each item contains:
  - "abstract_concept_name": Name of the abstract concept (cannot be identical to the label).
  - "description": Optional short description (1-2 sentences) explaining the abstract concept.
  - "keywords": List of related keywords (strings).
  - "label": The corresponding label concept from the provided list.

Note: Only return valid JSON. Do not add explanations, comments, or extra text.
""".strip()

In [26]:
abstract_prompt = abstract_concept_prompt(
    data_topic=data_desc['data_topic'],
    text_column=data_desc['text_column'],
    label_column=data_desc['label_column'],
    text_description=data_desc['text_description'],
    label_description=data_desc['label_description'],
    label_concepts=labels,
    keyword_concepts=keyword_concepts
)

In [27]:
# Response schema for the abstract-concept request: a JSON array of objects,
# one object per abstract concept.
output_struct = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "abstract_concept_name": {"type": "string"},
            "description": {"type": "string"},
            "keywords": {
                "type": "array",
                "items": {"type": "string"}
            },
            "label": {"type": "string"}
        },
        # "description" is left out on purpose: the prompt declares it optional.
        "required": ["abstract_concept_name", "keywords", "label"]
    }
}

In [28]:
abstract_concepts = llm_caller.structed_output(
    prompt=abstract_prompt,
    output_struct=output_struct
)

abstract_concepts

[{'abstract_concept_name': 'Cardiovascular Conditions',
  'description': 'Diseases affecting the heart and blood vessels.',
  'keywords': ['hypertension',
   'myocardial',
   'infarction',
   'ischemia',
   'coronary'],
  'label': 'cardiovascular diseases'},
 {'abstract_concept_name': 'Gastrointestinal Tract Disorders',
  'description': 'Diseases affecting the digestive system, including the stomach, intestines, and associated organs.',
  'keywords': ['pancreatitis', 'ulcer', 'obstruction', 'biliary', 'bowel'],
  'label': 'digestive system diseases'},
 {'abstract_concept_name': 'General Pathological States',
  'description': 'Broad categories of abnormal conditions affecting the body.',
  'keywords': ['deficiency', 'disorder', 'necrosis', 'oncogene', 'bone'],
  'label': 'general pathological conditions'},
 {'abstract_concept_name': 'Neoplastic Growths',
  'description': 'Abnormal and uncontrolled cell growth, including tumors and cancers.',
  'keywords': ['tumor',
   'carcinoma',
   'a

In [29]:
from utils.data_io import join_path, save_json

In [30]:
save_json(obj=keyword_concepts, dir=join_path(dataset, 'concepts'), file_name='keyword_concepts.json')

In [31]:
save_json(obj=abstract_concepts, dir=join_path(dataset, 'concepts'), file_name='abstract_concepts.json')