In [None]:
import numpy as np
import pandas as pd
import os
import logging
import csv
import re # Import regular expressions

# --- Configuration (must mirror the values used in your notebook setup) ---
T_HOURS = 48
N_BINS = 20
SEED = 0
logger = logging.getLogger("CombinedTokenInfo")

# Root folders; adjust PROJECT_ROOT if the project lives elsewhere.
PROJECT_ROOT = "/changed"
DATA_ROOT_DIR = os.path.join(PROJECT_ROOT, "final_data")  # or your output_dir
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results")

# --- Input paths ---
# All dictionary artefacts live side by side in one folder.
dict_dir = os.path.join(DATA_ROOT_DIR, 'dictionaries')
token_map_path = os.path.join(
    dict_dir,
    f'{T_HOURS}_{SEED}_{N_BINS}-token2index.npy',
)
# The boundaries file needs to be generated by the modified script 7.
boundaries_path = os.path.join(
    dict_dir,
    f'{T_HOURS}_{SEED}_{N_BINS}-bin_boundaries.npy',
)
# The value dictionary is generated by script 6.
value_dict_path = os.path.join(
    dict_dir,
    f'{T_HOURS}-{SEED}-values.npy',
)
d_items_path = '/D_ITEMS.csv'
d_labitems_path = '/D_LABITEMS.csv'

# --- Output path ---
# The combined CSV is written next to the dictionaries it is derived from.
output_csv_path = os.path.join(
    dict_dir,
    f'{T_HOURS}_{SEED}_{N_BINS}-token_combined_info.csv',
)
logger.info(f"Combined token info CSV will be saved to: {output_csv_path}")
# Create the target folder up front so the final write cannot fail on it.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

logger.info("--- Starting Combined Token Info Generation ---")

# --- 1. Load Token Map ---
# The file is unwrapped with .item() into the token -> index mapping; the
# reverse mapping (index -> token) is derived from it immediately.
try:
    logger.info(f"Loading token map from: {token_map_path}")
    token2index = np.load(token_map_path, allow_pickle=True).item()
    index2token = dict(zip(token2index.values(), token2index.keys()))
    logger.info(f"Loaded {len(token2index)} tokens.")
except Exception as e:
    # A missing file gets its own message; anything else is reported
    # generically. Either way the failure is propagated to the caller.
    if isinstance(e, FileNotFoundError):
        logger.error(f"Error: Token map file not found at {token_map_path}")
    else:
        logger.error(f"Error loading token map: {e}")
    raise

# --- 2. Load Bin Boundaries ---
# The file is unwrapped with .item(); the result is later looked up by
# ITEMID_UOM key to obtain the boundary sequence of a binned variable.
try:
    logger.info(f"Loading bin boundaries from: {boundaries_path}")
    bin_boundaries = np.load(boundaries_path, allow_pickle=True).item()
    logger.info(f"Loaded boundaries for {len(bin_boundaries)} binned variables.")
except Exception as e:
    # A missing file gets a hint about script 7; anything else is reported
    # generically. Either way the failure is propagated to the caller.
    if isinstance(e, FileNotFoundError):
        logger.error(f"Error: Bin boundaries file not found at {boundaries_path}. Did you modify and re-run script 7?")
    else:
        logger.error(f"Error loading bin boundaries: {e}")
    raise

# --- 3. Load Value Dictionary ---
# The file is unwrapped with .item(); the result is later looked up by
# ITEMID_UOM key to list the discrete values recorded for a variable.
try:
    logger.info(f"Loading value dictionary from: {value_dict_path}")
    value_dictionary = np.load(value_dict_path, allow_pickle=True).item()
    logger.info(f"Loaded value dictionary for {len(value_dictionary)} ITEMID_UOM keys.")
except Exception as e:
    # A missing file gets its own message; anything else is reported
    # generically. Either way the failure is propagated to the caller.
    if isinstance(e, FileNotFoundError):
        logger.error(f"Error: Value dictionary file not found at {value_dict_path}")
    else:
        logger.error(f"Error loading value dictionary: {e}")
    raise

# --- 4. Load MIMIC Dictionary CSVs ---
# Both tables contribute ITEMID -> LABEL pairs to a single lookup. D_LABITEMS
# is merged second, so its label wins for any ITEMID present in both files.
itemid_to_label = {}
try:
    logger.info(f"Loading D_ITEMS from: {d_items_path}")
    d_items_df = pd.read_csv(d_items_path, usecols=['ITEMID', 'LABEL'])
    itemid_to_label.update(d_items_df.set_index('ITEMID')['LABEL'].to_dict())

    logger.info(f"Loading D_LABITEMS from: {d_labitems_path}")
    d_labitems_df = pd.read_csv(d_labitems_path, usecols=['ITEMID', 'LABEL'])
    itemid_to_label.update(d_labitems_df.set_index('ITEMID')['LABEL'].to_dict())

    logger.info(f"Created combined ITEMID->Label map with {len(itemid_to_label)} unique entries.")
except Exception as e:
    # A missing CSV gets its own message; anything else is reported
    # generically. Either way the failure is propagated to the caller.
    if isinstance(e, FileNotFoundError):
        logger.error(f"Error: MIMIC dictionary CSV not found - {e}")
    else:
        logger.error(f"Error loading or processing MIMIC dictionaries: {e}")
    raise

# --- 5. Process Tokens and Gather Information ---
# Walk the vocabulary in ascending index order and build one output row per
# token: parsed ITEMID, clinical label, token type, plus either the value range
# of its bin (binned tokens) or the discrete values recorded for its
# ITEMID_UOM key (every other non-special token).
logger.info("--- Generating Combined Token Information ---")
combined_data = []
# Matches a purely numeric value suffix such as ':12'. A numeric suffix alone
# does NOT prove the token is binned: discrete tokens can carry numeric values
# too (e.g. '220227_%:98'), so the key must also exist in bin_boundaries.
binned_token_pattern = re.compile(r":(\d+)$")
max_display = 20  # Upper limit on the discrete values listed per token

sorted_indices = sorted(index2token)

for index in sorted_indices:
    token_string = index2token[index]
    # Defaults describe a special token; they are overwritten below for
    # every token that can be parsed.
    clinical_label = "<Special Token>"
    itemid_parsed = None
    value_range_str = ""  # Filled for binned tokens only
    possible_values_str = ""  # Filled for discrete/non-binned tokens only
    token_type = "Special"

    if token_string not in ('<PAD>', '<UNK>'):
        # Best effort: a token that cannot be parsed is reported and written
        # out as an error row instead of aborting the whole export.
        try:
            # The ITEMID is the part in front of the first underscore.
            itemid = int(token_string.split('_')[0])
            itemid_parsed = itemid
            clinical_label = itemid_to_label.get(itemid, "<Unknown ITEMID>")

            match = binned_token_pattern.search(token_string)
            binned_key = token_string[:match.start()] if match else None

            if binned_key is not None and binned_key in bin_boundaries:
                # --- Binned token: numeric suffix AND known boundaries ---
                token_type = "Binned Continuous"
                itemid_uom_key = binned_key
                bin_index = int(match.group(1))
                boundaries = bin_boundaries[itemid_uom_key]
                # NOTE(review): this check assumes the stored sequence holds
                # one more edge than there are bins - confirm against the
                # script that writes the boundaries file.
                if 0 <= bin_index < len(boundaries) - 1:
                    lower_bound = boundaries[bin_index]
                    upper_bound = boundaries[bin_index + 1]
                    value_range_str = f"[{lower_bound:.4f}, {upper_bound:.4f})"
                else:
                    logger.warning(f"Token '{token_string}' has invalid bin index {bin_index}.")
                    value_range_str = "<Invalid Bin Index>"
            else:
                # --- Discrete or non-binned token ---
                token_type = "Discrete/Non-Binned"
                # Everything before the last ':' is the ITEMID_UOM key; a
                # token without ':' is used as the key unchanged.
                key_part, separator, _ = token_string.rpartition(':')
                itemid_uom_key = key_part if separator else token_string

                if itemid_uom_key in value_dictionary:
                    possible_disc_values = value_dictionary[itemid_uom_key].get('disc', [])
                    all_possible = sorted(set(map(str, possible_disc_values)))
                    if all_possible:
                        # Limit number of displayed values if too many
                        possible_values_str = ", ".join(all_possible[:max_display])
                        if len(all_possible) > max_display:
                            possible_values_str += f", ... ({len(all_possible) - max_display} more)"
                    else:
                        possible_values_str = "<No discrete values listed>"
                else:
                    if match:
                        # Numeric suffix but the key is unknown everywhere:
                        # the token can be neither resolved as a bin nor as
                        # a discrete value, so make that visible.
                        logger.warning(
                            f"Token '{token_string}' key '{itemid_uom_key}' not found "
                            "in bin boundaries or value dictionary."
                        )
                    possible_values_str = "<ITEMID_UOM not in values.npy>"

        except Exception as e:
            logger.warning(f"Error processing token '{token_string}': {e}")
            clinical_label = "<Processing Error>"
            token_type = "Error"

    # Append data for this token
    combined_data.append({
        'Index': index,
        'Token': token_string,
        'ITEMID': itemid_parsed if itemid_parsed is not None else '',
        'Label': clinical_label,
        'Type': token_type,
        'Value_Range (for Binned)': value_range_str,
        'Possible_Discrete_Values (from Training)': possible_values_str
    })

# --- 6. Create DataFrame and Save to CSV ---
# Turn the collected rows into a table ordered by token index and write it
# out with every field quoted and without the DataFrame's own index column.
try:
    logger.info(f"Creating combined DataFrame with {len(combined_data)} rows.")
    combined_df = pd.DataFrame(combined_data)

    # Order rows by token index (optional)
    combined_df = combined_df.sort_values(by='Index')

    logger.info(f"Saving combined token info to CSV: {output_csv_path}")
    combined_df.to_csv(
        output_csv_path,
        quoting=csv.QUOTE_ALL,
        index=False,
    )
    logger.info("Successfully saved CSV.")

except Exception as e:
    # Report the failure, then let it propagate to the caller.
    logger.error(f"Error creating or saving combined DataFrame to CSV: {e}")
    raise

logger.info("--- Combined Token Info Generation Finished ---")

Binned token '220046_bpm:120' key '220046_bpm' not found in boundaries.
Binned token '220047_bpm:50' key '220047_bpm' not found in boundaries.
Binned token '220047_bpm:60' key '220047_bpm' not found in boundaries.
Binned token '220056_mmHg:80' key '220056_mmHg' not found in boundaries.
Binned token '220056_mmHg:90' key '220056_mmHg' not found in boundaries.
Binned token '220058_mmHg:160' key '220058_mmHg' not found in boundaries.
Token '220060_mmHg:16' has invalid bin index 16.
Token '220061_mmHg:19' has invalid bin index 19.
Token '220074_mmHg:17' has invalid bin index 17.
Token '220210_insp/min:17' has invalid bin index 17.
Binned token '220227_%:92' key '220227_%' not found in boundaries.
Binned token '220227_%:96' key '220227_%' not found in boundaries.
Binned token '220227_%:97' key '220227_%' not found in boundaries.
Binned token '220227_%:98' key '220227_%' not found in boundaries.
Binned token '220227_%:99' key '220227_%' not found in boundaries.
Token '220274_units:19' has inv