In [13]:
import json

# Load the JSON file. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    dsm5_data = json.load(f)

# Function to recursively extract all keys with full paths
def extract_keys(data, parent_key=''):
    """
    Collect the dotted path of every dictionary key reachable from ``data``.

    Dict keys are joined with ``.``; list positions are written as ``[i]``.
    List positions only contribute to the paths of keys nested below them;
    they are never recorded as entries themselves.

    Parameters:
    - data: any JSON-like value (dict, list or scalar).
    - parent_key: path prefix of ``data`` ('' for the root).

    Returns:
    - dict mapping each key path to None, in depth-first traversal order.
    """
    collected = {}

    def walk(node, prefix):
        # Scalars have no keys, so they fall through both branches untouched.
        if isinstance(node, dict):
            for name, child in node.items():
                path = f"{prefix}.{name}" if prefix else name
                collected[path] = None
                walk(child, path)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                slot = f"{prefix}[{position}]" if prefix else f"[{position}]"
                walk(child, slot)

    walk(data, parent_key)
    return collected

# Flatten the dataset into a {key path: null} mapping
all_keys = extract_keys(dsm5_data)

# Serialise the mapping first, then write the finished text in one go
with open("extracted_keys.json", "w") as outfile:
    outfile.write(json.dumps(all_keys, indent=4))

print("All keys have been saved to extracted_keys.json")



All keys have been saved to extracted_keys.json


In [31]:
import json

# Load the JSON file. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    dsm5_data = json.load(f)

# Accumulator for the grouped entries produced by process_disorder_data
dsm5_texts = []

# Depth-first walk that emits one grouped entry per dictionary level
def process_disorder_data(category_name, data, accumulated_path=""):
    """
    Append one ``{"context", "content"}`` entry for ``data`` to ``dsm5_texts``.

    Nested dictionaries are processed first (so their entries are appended
    before the entry of the enclosing level). List values contribute one
    pair per item, keyed ``<path> > <key>[i]``; every other value contributes
    a single pair keyed ``<path> > <key>``.

    Parameters:
    - category_name: str - name of the section being processed.
    - data: dict - contents of that section.
    - accumulated_path: str - path of the enclosing sections ("" at the root).
    """
    # Path of this level, used both as context and as key prefix
    if accumulated_path:
        here = f"{accumulated_path} > {category_name}"
    else:
        here = category_name

    pairs = []
    for name, payload in data.items():
        if isinstance(payload, dict):
            # Sub-section: it gets its own entry
            process_disorder_data(name, payload, accumulated_path=here)
            continue
        if isinstance(payload, list):
            # One pair per list item, indexed in the key
            pairs.extend(
                {"key": f"{here} > {name}[{position}]", "value": element}
                for position, element in enumerate(payload)
            )
            continue
        pairs.append({"key": f"{here} > {name}", "value": payload})

    # Record this level after all of its sub-sections
    dsm5_texts.append({"context": here, "content": pairs})

# Walk every top-level category, each starting from an empty accumulated path
for main_category in dsm5_data:
    subcategories = dsm5_data[main_category]
    process_disorder_data(main_category, subcategories, accumulated_path="")

# Show the first collected entry as a quick sanity check
print("Sample entry:", dsm5_texts[0])

Sample entry: {'context': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Somatic Symptom Disorder', 'content': [{'key': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Somatic Symptom Disorder > Diagnostic Criteria', 'value': 'A. One or more somatic symptoms that are distressing or result in significant disruption of daily life. B. Excessive thoughts, feelings, or behaviors related to the somatic symptoms or associated health concerns as manifested by at least one of the following: 1. Disproportionate and persistent thoughts about the seriousness of ones symptoms. 2. Persistently high level of anxiety about health or symptoms. 3. Excessive time and energy devoted to these symptoms or health concerns. C. Although any one somatic symptom may not be continuously present, the state of being symptomatic is persistent (typically more than 6 months). Specify if: With predominant pain (previously pain disorder): This specifier is for individuals whose somatic symptoms predominantly involve pain. Specify if

In [33]:
import json

# Load the JSON file. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    dsm5_data = json.load(f)

# Accumulator for the flat {"input_text", "label"} samples
dsm5_texts = []

# Depth-first walk that emits one sample per leaf value / list item
def process_disorder_data(category_name, data, accumulated_path=""):
    """
    Append ``{"input_text", "label"}`` samples for ``data`` to ``dsm5_texts``.

    The label is always the ``" > "``-joined path of the key. The input text
    is ``"<path>: <value>"``; for list values one sample is produced per item
    and the item's index is added to the path inside the input text only.

    Parameters:
    - category_name: str - name of the section being processed.
    - data: dict - contents of that section.
    - accumulated_path: str - path of the enclosing sections ("" at the root).
    """
    if accumulated_path:
        here = f"{accumulated_path} > {category_name}"
    else:
        here = category_name

    for name, payload in data.items():
        if isinstance(payload, dict):
            # Sub-section: descend with the extended path
            process_disorder_data(name, payload, accumulated_path=here)
            continue

        label = f"{here} > {name}"
        if isinstance(payload, list):
            # Items share the label of their list; only the text is indexed
            for position, element in enumerate(payload):
                dsm5_texts.append({
                    "input_text": f"{label}[{position}]: {element}",
                    "label": label,
                })
        else:
            dsm5_texts.append({
                "input_text": f"{label}: {payload}",
                "label": label,
            })

# Feed each top-level category to the recursive walker (root level, so no
# accumulated path is passed)
for main_category, subcategories in dsm5_data.items():
    process_disorder_data(category_name=main_category, data=subcategories)

# Show the first collected sample as a quick sanity check
print("Sample entry:", dsm5_texts[0])

Sample entry: {'input_text': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Description: This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychiatric) clini

In [35]:
import json

# Load the JSON file. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    dsm5_data = json.load(f)

# Accumulator for the samples: section path as input, section content as label
dsm5_texts = []

# Depth-first walk that turns every leaf into a (path -> content) sample
def process_disorder_data(category_name, data, accumulated_path=""):
    """
    Append ``{"input_text": <path>, "label": <content>}`` samples to ``dsm5_texts``.

    Nested dictionaries are descended into. Each list item becomes its own
    sample whose path ends in ``<key>[i]``; any other value becomes a single
    sample whose path ends in ``<key>``. Values are stored unchanged.

    Parameters:
    - category_name: str - name of the section being processed.
    - data: dict - contents of that section.
    - accumulated_path: str - path of the enclosing sections ("" at the root).
    """
    if accumulated_path:
        section_path = f"{accumulated_path} > {category_name}"
    else:
        section_path = category_name

    for field, content in data.items():
        if isinstance(content, dict):
            # Sub-section: descend with the extended path
            process_disorder_data(field, content, accumulated_path=section_path)
            continue

        field_path = f"{section_path} > {field}"
        if isinstance(content, list):
            # One sample per list element, indexed in the path
            dsm5_texts.extend(
                {"input_text": f"{field_path}[{position}]", "label": element}
                for position, element in enumerate(content)
            )
        else:
            dsm5_texts.append({"input_text": field_path, "label": content})

# Walk every top-level category; the root level has no accumulated path yet
for main_category in dsm5_data:
    subcategories = dsm5_data[main_category]
    process_disorder_data(main_category, subcategories)

# Show the first collected sample as a quick sanity check
print("Sample entry:", dsm5_texts[0])


Sample entry: {'input_text': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Description', 'label': 'This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychia

In [45]:
import json

# Load the JSON file. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    dsm5_data = json.load(f)

# Accumulator: one entry per section (or per list item inside a section)
dsm5_texts = []

# Depth-first walk over the nested category dictionaries
def process_disorder_data(category_name, data, accumulated_path=""):
    """
    Record every leaf below ``data`` in ``dsm5_texts`` as a path/content pair.

    Each recorded dict has the full ``" > "``-joined path under
    ``"input_text"`` and the untouched value under ``"label"``. Lists are
    expanded item by item with ``[i]`` appended to the path; nested
    dictionaries are handled by recursion.

    Parameters:
    - category_name: str - name of the section being processed.
    - data: dict - contents of that section.
    - accumulated_path: str - path of the enclosing sections ("" at the root).
    """
    base = f"{accumulated_path} > {category_name}" if accumulated_path else category_name
    record = dsm5_texts.append

    for section, body in data.items():
        if isinstance(body, dict):
            # Sub-section: recurse with the extended path
            process_disorder_data(section, body, accumulated_path=base)
            continue

        if not isinstance(body, list):
            # Plain value: a single entry for the whole section
            record({"input_text": f"{base} > {section}", "label": body})
            continue

        # List: one entry per element
        for position, element in enumerate(body):
            record({"input_text": f"{base} > {section}[{position}]", "label": element})

# Run the walker once per top-level category
for main_category, subcategories in dsm5_data.items():
    process_disorder_data(main_category, subcategories, accumulated_path="")

# Show the first collected entry as a quick sanity check
print(f"Sample entry: {dsm5_texts[0]}")



Sample entry: {'input_text': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Description', 'label': 'This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychia

In [47]:
# This cell used `json` without importing it, so it only ran when an earlier
# cell had already been executed in the same kernel.
import json

# Load the JSON data, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open('dsm5_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preprocess the data
def flatten_data(data, prefix=''):
    """
    Flatten nested dictionaries into a list of ``(label, value)`` tuples.

    Labels are the keys along the path joined with ``" | "``. Only dict
    values are descended into; every other value (lists included) is kept
    unchanged as the tuple's second element.

    Parameters:
    - data: dict to flatten.
    - prefix: text placed in front of every label produced for ``data``.

    Returns:
    - list of (label, value) tuples in depth-first key order.
    """
    def walk(node, lead):
        for name, payload in node.items():
            if isinstance(payload, dict):
                yield from walk(payload, f"{lead}{name} | ")
            else:
                yield (f"{lead}{name}", payload)

    return list(walk(data, prefix))

flattened_data = flatten_data(data)

# Derive the two parallel lists: the bare labels and "label: content" strings
labels = [pair[0] for pair in flattened_data]
texts = ["{}: {}".format(*pair) for pair in flattened_data]

print(texts)

['SOMATIC SYMPTOM AND RELATED DISORDERS | Description: This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychiatric) clinicians. The major diagnosis i

In [51]:
import json

def extract_keys(data):
    """
    Return a copy of ``data`` that keeps its structure but none of its values.

    Dicts keep their keys, lists keep their length, and every other value is
    replaced by None.
    """
    if isinstance(data, list):
        return list(map(extract_keys, data))
    if not isinstance(data, dict):
        return None

    skeleton = {}
    for name, child in data.items():
        skeleton[name] = extract_keys(child)
    return skeleton

# Read the JSON file, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open('disorders_icd_codes.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the keys
keys_hierarchy = extract_keys(data)

# Write the result to a new JSON file
with open('icd_10_keys_hierarchy.json', 'w') as file:
    json.dump(keys_hierarchy, file, indent=2)

# Report the file that was actually written; the message used to name
# 'dsm5_keys_hierarchy.json', which this cell never creates.
print("Keys hierarchy has been extracted and saved to 'icd_10_keys_hierarchy.json'")

Keys hierarchy has been extracted and saved to 'dsm5_keys_hierarchy.json'


In [59]:
import json

def extract_hierarchy(json_data, parent_key=""):
    """
    Flatten nested dictionaries into a list of ``{"label", "target"}`` dicts.

    ``label`` is the ``" > "``-joined chain of keys leading to a value and
    ``target`` is that value, unchanged. Only dictionaries are descended
    into, so any non-dict value (string, list, number, None) ends up as a
    target.

    Parameters:
    - json_data: dict to traverse.
    - parent_key: label of the enclosing level ("" at the root).
    """
    pairs = []

    for name, payload in json_data.items():
        # Extend the path unless we are at the root
        if parent_key:
            label = f"{parent_key} > {name}"
        else:
            label = name

        if not isinstance(payload, dict):
            pairs.append(dict(label=label, target=payload))
            continue

        # Nested section: splice in everything found below it
        pairs += extract_hierarchy(payload, label)

    return pairs

# Load the JSON data, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open("dsm5_dataset.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract hierarchy and context
hierarchical_data = extract_hierarchy(json_data)

# Display or save the output for fine-tuning
for entry in hierarchical_data:
    print(entry)  # Or save to a file, or preprocess further if needed

{'label': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Description', 'target': 'This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychiatric) clinicians. 

In [63]:
import json

def extract_labels_and_targets(data, path=""):
    """
    List every non-dict value in ``data`` together with its hierarchical path.

    Each result is ``{"label": <keys joined by " > ">, "target": <value>}``,
    in depth-first key order. Values are stored as-is; only dictionaries are
    descended into.

    Parameters:
    - data: dict to traverse.
    - path: label of the enclosing level ("" at the root).
    """
    result = []

    # Explicit depth-first traversal: each frame is (prefix, key/value iterator)
    pending = [(path, iter(data.items()))]
    while pending:
        prefix, remaining = pending[-1]
        for name, payload in remaining:
            label = f"{prefix} > {name}" if prefix else name
            if isinstance(payload, dict):
                # Pause this level and handle the sub-section first
                pending.append((label, iter(payload.items())))
                break
            result.append({"label": label, "target": payload})
        else:
            # This level is exhausted; resume its parent
            pending.pop()

    return result

# Load the JSON data, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open("dsm5_dataset.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Process the JSON to extract context-based labels and their content
processed_data = extract_labels_and_targets(json_data)

# Display sample output
for entry in processed_data[:2]:  # Display the first two entries for a quick check
    print(entry)



{'label': 'SOMATIC SYMPTOM AND RELATED DISORDERS > Description', 'target': 'This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychiatric) clinicians. 

In [73]:
import json

# Load JSON dataset, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open("dsm5_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Recursive function to capture hierarchical labels and their target values
def extract_labels_and_targets(data, parent_key=""):
    """
    Flatten nested dictionaries into ``{"label", "target"}`` entries.

    ``label`` is the ``" > "``-joined chain of keys. ``target`` is the value
    with surrounding whitespace stripped when the value is a string; any
    other non-dict value (list, number, None, ...) is recorded as "".

    Parameters:
    - data: dict to traverse.
    - parent_key: label of the enclosing level ("" at the root).

    Returns:
    - list of entry dicts in depth-first key order.
    """
    def walk(node, prefix):
        for name, payload in node.items():
            label = f"{prefix} > {name}" if prefix else name
            if isinstance(payload, dict):
                yield from walk(payload, label)
            elif isinstance(payload, str):
                yield {"label": label, "target": payload.strip()}
            else:
                # Non-text leaves carry no usable target text
                yield {"label": label, "target": ""}

    return list(walk(data, parent_key))

# Flatten the dataset into label/target entries
dsm5_entries = extract_labels_and_targets(data)

# Keep one entry per label; a later entry with the same label replaces the
# earlier one while keeping its position
unique_entries = {}
for entry in dsm5_entries:
    unique_entries[entry['label']] = entry
dsm5_entries = list(unique_entries.values())

# Pretty-print a small sample for verification
for entry in dsm5_entries[:3]:
    print(json.dumps(entry, indent=4))


{
    "label": "SOMATIC SYMPTOM AND RELATED DISORDERS > Description",
    "target": "This chapter includes the diagnoses of somatic symptom disorder, illness anxiety disorder, functional neurological symptom disorder (conversion disorder), psychological factors affecting other medical conditions, factitious disorder, other specified somatic symptom and related disorder, and unspecified somatic symptom and related disorder. All of the disorders in this chapter share a common feature: the prominence of somatic symptoms and/or illness anxiety associated with significant distress and impairment. Individuals with disorders with prominent somatic symptoms or illness anxiety are commonly encountered in primary care and other medical settings but are less commonly encountered in psychiatric and other mental health settings. These reconceptualized diagnoses, based on a reorganization of DSM-IV somatoform disorder diagnoses, are more useful for primary care and other medical (nonpsychiatric) cli

In [77]:
import json

# Load JSON dataset, decoded explicitly as UTF-8 (the JSON standard encoding)
# rather than with the platform's locale-dependent default.
with open("disorders_icd_codes.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Recursive function to capture hierarchical labels and their target values
def extract_labels_and_targets(data, parent_key=""):
    """
    Collect one ``{"label", "target"}`` entry per non-dict value in ``data``.

    Labels are the keys along the path joined with ``" > "``. String values
    are stripped of surrounding whitespace; every other non-dict value is
    stored as an empty string.

    Parameters:
    - data: dict to traverse.
    - parent_key: label of the enclosing level ("" at the root).

    Returns:
    - list of entry dicts in depth-first key order.
    """
    entries = []

    def visit(node, prefix):
        for name, payload in node.items():
            if prefix:
                label = f"{prefix} > {name}"
            else:
                label = name

            if isinstance(payload, dict):
                visit(payload, label)
                continue

            cleaned = payload.strip() if isinstance(payload, str) else ""
            entries.append({"label": label, "target": cleaned})

    visit(data, parent_key)
    return entries

# Extract entries
dsm5_entries = extract_labels_and_targets(data)

# Optional: remove duplicates by converting to a dictionary by label
unique_entries = {entry['label']: entry for entry in dsm5_entries}
dsm5_entries = list(unique_entries.values())

# Output all entries in the expected JSON list format
with open("icd_codes_finetune_ready.json", "w") as output_file:
    json.dump(dsm5_entries, output_file, indent=4)

# Report the file that was actually written; the message used to name
# dsm5_finetune_ready.json, which this cell never creates.
print("Output written to icd_codes_finetune_ready.json")


Output written to dsm5_finetune_ready.json
