In [None]:
import os
import json
import pandas as pd
import re
from pymongo import MongoClient
from bson import ObjectId


def create_all_results_json():
    """Merge every ``r_*.json`` file of the working directory into ``all_results.json``.

    Each input file may hold a single JSON object or a list of objects.
    Entries are de-duplicated on their ``'_id'`` value (the first occurrence
    wins). Entries that are not dictionaries, or whose ``'_id'`` is missing
    or falsy, are skipped. Files that are not valid JSON are reported and
    skipped.

    Files are read in sorted name order so that the position of every record
    in the output (used later as ``json_i``) is the same on every run.

    Returns:
        None. The merged list is written to ``all_results.json`` in the
        current working directory and the path is printed.
    """
    directory = os.getcwd()
    output_file = os.path.join(directory, 'all_results.json')

    def dedup_key(_id):
        """Return a hashable key for ``_id`` so it can be stored in a set."""
        try:
            hash(_id)
        except TypeError:
            # Unhashable ids (e.g. a nested object such as {"$oid": "..."})
            # are keyed by their canonical JSON text instead.
            return ('unhashable', json.dumps(_id, sort_keys=True, default=str))
        return _id

    def read_json_files(directory):
        """Collect the unique dictionary entries of all ``r_*.json`` files."""
        results = []
        seen_ids = set()

        # os.listdir() returns names in arbitrary order; sort for reproducibility.
        for filename in sorted(os.listdir(directory)):
            if not (filename.startswith("r_") and filename.endswith(".json")):
                continue

            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from file: {filepath}")
                continue

            if isinstance(data, dict):  # A file may contain one record only
                data = [data]
            if not isinstance(data, list):
                print(f"Skipping file without a JSON object or list at top level: {filepath}")
                continue

            for entry in data:
                if not isinstance(entry, dict):
                    continue
                _id = entry.get('_id')
                if not _id:
                    continue
                key = dedup_key(_id)
                if key not in seen_ids:
                    seen_ids.add(key)
                    results.append(entry)

        return results

    def save_to_json(data, output_file):
        """Write ``data`` as indented UTF-8 JSON without escaping non-ASCII text."""
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)

    combined_results = read_json_files(directory)
    save_to_json(combined_results, output_file)

    print(f"Combined results saved to {output_file}")

# Build all_results.json from the r_*.json files found in the current working directory.
create_all_results_json()




In [None]:
# Remove pandas' display limits: full cell text, every column and every row are shown.
# NOTE(review): with max_rows=None, displaying a large frame prints all of its rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:

def load_all_results_json():
    """Read ``all_results.json`` from the current working directory.

    A status message is printed for each outcome.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist
        or does not contain valid JSON.
    """
    file_path = os.path.join(os.getcwd(), 'all_results.json')

    # Guard clause: nothing to load.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    print("File exists. Proceeding to load.")
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
    except json.JSONDecodeError:
        print("Error decoding JSON file.")
        return None

    print("File loaded successfully.")
    return loaded

# Load the merged results; input_json is None when the file is missing or is not valid JSON.
input_json = load_all_results_json()

# Optionally print the loaded JSON content to verify
# print(json.dumps(input_json, indent=2))


In [None]:
# Maps a question name to the place in a record's 'json_data_used' where its reference value lives.
# Two entry shapes are read by format_data():
#   - single collection: "collection" is a string and "field" names the value to return as-is;
#   - several collections: "collection" is a list, "fields" lists field names (a tuple groups
#     fields that are read together, e.g. systolic/diastolic), and "format" is a str.format()
#     template filled positionally with the collected values.
question_data_mapping = {
    "chiefcomplaint":      {"collection": "ED-Triage-prammed", "field": "chiefcomplaint"},
    "medication_reported": {"collection": "ED-Medrecon-prammed", "field": "name"},
    "medication_pyxis":    {"collection": "ED-Pyxis-prammed", "field": "name"},
    "diagnosis":           {"collection": "ED-Diagnosis-prammed", "field": "icd_title"},
    "bp":                  {"collection": ["ED-Triage-prammed", "ED-VitalSigns-prammed"], "fields": [("sbp", "dbp")], "format": "triage: {0} {1}; vitals: {2} {3}"},
    "heartrate":           {"collection": ["ED-Triage-prammed", "ED-VitalSigns-prammed"], "fields": ["heartrate"], "format": "triage: {0}; vitals: {1}"},
    "o2sat_triage":        {"collection": ["ED-Triage-prammed", "ED-VitalSigns-prammed"], "fields": ["o2sat"], "format": "triage: {0}; vitals: {1}"},
    "resprate":            {"collection": ["ED-Triage-prammed", "ED-VitalSigns-prammed"], "fields": ["resprate"], "format": "triage: {0}; vitals: {1}"},
    "temperature":         {"collection": ["ED-Triage-prammed", "ED-VitalSigns-prammed"], "fields": ["temperature"], "format": "triage: {0}; vitals: {1}"},
    "pain_triage":         {"collection": ["ED-Triage-prammed"], "fields": ["pain"], "format": "triage: {0}"}
}

In [None]:
# Questions whose reference value is a single text field in question_data_mapping
# (the same four names are passed as questions_to_process in the GPT post-processing cells).
# NOTE(review): this list itself is not referenced by the cells visible here.
nominal_questions = [
    "chiefcomplaint",
    "medication_reported",
    "medication_pyxis",
    "diagnosis"
]

In [None]:
import pandas as pd
import json
import os

def create_basic_dataframe(input_json):
    """Flatten the records into one DataFrame row per (record, question) pair.

    Parameters:
    input_json (list): Records, each a dict with an 'answers' dict and
        optionally an '_id'.

    Returns:
    pd.DataFrame: Columns 'json_i' (position of the record in input_json),
        '_id' ('' when the record has none), 'question', 'answer' and an
        empty-string 'data' placeholder.
    """
    flattened = [
        [position, record.get('_id', ''), question, answer, '']
        for position, record in enumerate(input_json)
        for question, answer in record['answers'].items()
    ]
    return pd.DataFrame(flattened, columns=['json_i', '_id', 'question', 'answer', 'data'])

# Build the long-format frame (one row per record and question) and preview the first row.

df_basic = create_basic_dataframe(input_json)
df_basic.head(1)

In [None]:
# Column dtypes and non-null counts of the flattened frame.
df_basic.info()

In [None]:
# NOTE(review): duplicate of the previous cell.
df_basic.info()

In [None]:
def format_data(question, json_data_used, question_data_mapping):
    """Look up the reference value(s) for ``question`` in one record's source data.

    Parameters:
    question (str): Question name; must be a key of ``question_data_mapping``.
    json_data_used (dict): Maps collection names to a dict of field values.
        A missing or ``None`` collection is treated as empty.
    question_data_mapping (dict): Lookup description per question. An entry
        either has a string "collection" plus a "field", or a list
        "collection" plus "fields" and a "format" template.

    Returns:
    The raw field value for single-collection questions, a formatted string
    for multi-collection questions, or ``None`` when the question is not in
    the mapping. Missing fields are reported as the string "nan".
    """
    data_info = question_data_mapping.get(question)
    if not data_info:
        return None

    def collection_dict(name):
        # A collection that is absent or stored as null behaves like an empty one.
        return (json_data_used or {}).get(name) or {}

    collections = data_info["collection"]
    if isinstance(collections, list):
        values = []
        for field_group in data_info["fields"]:
            # A tuple groups fields that are read together from each collection.
            field_names = field_group if isinstance(field_group, tuple) else (field_group,)
            # Values are gathered in the declared collection order, so the
            # positional placeholders of "format" line up with that order.
            for collection_name in collections:
                source = collection_dict(collection_name)
                values.extend(source.get(field, "nan") for field in field_names)
        return data_info["format"].format(*values)

    return collection_dict(collections).get(data_info["field"], "nan")

def fill_data_column(df, input_json, question_data_mapping):
    """Write the reference value of every row into the 'data' column, in place.

    Parameters:
    df (pd.DataFrame): Frame with 'question', 'json_i' and 'data' columns.
    input_json (list): Records indexed by 'json_i'; each needs a
        'json_data_used' entry.
    question_data_mapping (dict): Passed through to format_data().

    Returns:
    pd.DataFrame: The same ``df`` object, with 'data' set to the value
        returned by format_data(), or to 'nan' when that value is None.
    """
    for row_label, question, record_position in zip(df.index, df['question'], df['json_i']):
        source_record = input_json[record_position]
        formatted = format_data(question, source_record['json_data_used'], question_data_mapping)
        df.at[row_label, 'data'] = 'nan' if formatted is None else formatted

    return df


In [None]:
df_filled = fill_data_column(df_basic, input_json, question_data_mapping)
# fill_data_column() mutates and returns df_basic itself, so when this cell is run
# again the 'Class' column already exists and DataFrame.insert() would raise ValueError.
if 'Class' not in df_filled.columns:
    df_filled.insert(df_filled.columns.get_loc('data') + 1, 'Class', '')


In [None]:
# Preview: 'data' now holds the reference values and 'Class' is still empty.
df_filled.head(2)

# Postprocessing with OpenAI


In [None]:
# Snapshot of the filled frame taken before any GPT post-processing.
df_unprocessed = df_filled.copy()
df_unprocessed.head(5)

In [None]:
# The OpenAI cells below need the "aidocs" kernel.

In [None]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Load environment variables from a .env file so the API key is never stored in the notebook.
# The file location can be overridden with the OPENAI_ENV_FILE environment variable;
# the previously hard-coded path remains the default.
env_path = os.getenv('OPENAI_ENV_FILE', '/home/msd4/aidocsMosi/openAI_Token.env')
load_dotenv(dotenv_path=env_path)

# Retrieve the OpenAI API key from the environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("No API key found. Please set your OPENAI_API_KEY in the .env file.")

# Configure the module-level OpenAI client with the retrieved API key
openai.api_key = OPENAI_API_KEY

In [None]:
# Final prompt for the reference side of condition questions: it is applied to the
# 'data' column of the 'chiefcomplaint' and 'diagnosis' rows and asks for a
# comma-separated list of normalized and expanded condition terms.
# The {text} placeholder is filled in by call_openai().
prompt_template_conditions_data = """
Please extract the relevant medical conditions from the following text and return a comma-separated list. Follow these rules:
- Convert all items to lower case.
- Write out abbreviations.
- Remove duplicates, keeping only one instance.
- If the text contains a specific subgroup (e.g., "Coffee Ground Emesis"), keep the original and also add the general terminology.
- Expand terms with multiple components into all relevant variations without losing medical context. For example:
  - "s/p Fall, L Hand injury, Head injury" should become "s/p fall, status post fall, l hand injury, left hand injury, hand injury, head injury".
  - "TETANUS TOXOID INOCULAT" should become "tetanus toxoid inoculation, tetanus inoculation, tetanus".
  - "Anemia, unspecified" should become "anemia unspecified, anemia".
  - "L Knee swelling, Abnormal labs" should become "l knee swelling, left knee swelling, knee swelling, abnormal labs, abnormal lab results".
  - "Long term (current) use of aspirin" should become "long term (current) use of aspirin, long term use of aspirin, use of aspirin, aspirin use, aspirin".
  - "Lower back pain, MVC, Neck pain" should become "lower back pain, motor vehicle accident, mvc, neck pain".
  - "Post-traumatic stress disorder, unspecified" should become "post-traumatic stress disorder, post traumatic stress disorder, traumatic stress disorder".

Here is the text:
'''
{text}
'''
"""

In [None]:
# Prompt for the answer side of condition questions: it is applied to the 'answer'
# column of the 'chiefcomplaint' and 'diagnosis' rows.
# The rule "Do not list diagnoses or conditions that that are ruled out" was added because
# the RAG LLM tended to fill the maximum output length with ruled-out conditions (seen twice).
# NOTE(review): that rule contains a duplicated word ("that that"); the prompt text is left unchanged here.

prompt_template_conditions_answer = """
Please extract the relevant medical conditions, diagnosis and issues from the following text and return a comma-separated list. Follow these rules:
- Convert all items to lower case.
- Write out abbreviations where necessary.
- Remove single numbers and special characters.
- Do not use line breaks; keep everything in one line.
- Condense longer descriptions into single words or short phrases while retaining medical meaning.
- Correct any obvious spelling mistakes (e.g., "hest pain" to "chest pain").
- Keep general terms to ensure medical context but reduce unnecessary variations. For example:
  - "No previous health problems mentioned.\n- No persisting health problems mentioned.\n- Symptoms reported: severe pain (rated 7 on a scale of 0 to 10), started a few hours ago, getting worse." should become "no previous health problems, severe pain".
  - "Chest pain\n- Low blood pressure\n- Tachypnea (respiratory rate of 19 breaths per minute, which is higher than normal)\n- Bradycardia (heart rate of 67 beats per minute, which is lower than normal)\n- Possible hypotension (blood pressure of 121 over 80)" should become "chest pain, low blood pressure, tachypnea, bradycardia, possible hypotension".
  - "Unsteadiness on feet" should become "unsteadiness on feet, unsteadiness".
- Also list the wounds and injuries.
- List the content in parentheses also separately.
- Do not list diagnoses or conditions that that are ruled out.
Here is the text:
'''
{text}
'''
"""

In [None]:
# Prompt for the reference side of medication questions: it is applied to the 'data'
# column of the 'medication_reported' and 'medication_pyxis' rows and asks for a
# comma-separated list of drug terms with spelling/dosage variations.
# The {text} placeholder is filled in by call_openai().
prompt_template_drugs_data = """
Please extract the relevant drugs mentioned in the following text and return a comma-separated list in CSV format. Follow these rules:
- Clean all special characters, including dots.
- Convert all terms to lowercase.
- Ensure forms and dosages are not mentioned alone without the drug name or active ingredient.
- Keep the translated original term and append variations. Variations should include:
  - One original term with dosage.
  - One without dosage.
  - Only market label name if present in data but keep original label names.
  - Include the active ingredient name if present.
- For example:
  - "Lantus Solostar U-100 Insulin" should become "lantus solostar u-100 insulin, lantus solostar insulin, lantus solostar, lantus insulin, insulin".
  - "Morphin 2 tabletten" should become "morphine 2 tabs, morphine tabs, morphine".
  - "DiphenhydrAMINE 25mg CAP" should become "diphenhydramine 25mg cap, diphenhydramine 25mg capsules, diphenhydramine capsules, diphenhydramine 25mg, diphenhydramine".
  - "Oxycodone [OxyContin]" should become "oxycodone oxycontin, oxycodone, oxycontin".
  - "Vitamin D2" should become "vitamin d2, vit d2".
  - "MetRONIDAZOLE (Flagyl)" should become "metronidazole flagyl, metronidazole, flagyl".
  - "Tetanus-DiphTox-Acellular Pertuss" should become "tetanus-diphtox-acellular pertuss, tetanus diphtox acellular pertuss, tetanus diphteria pertussis vaccine, tetanus diphteria pertussis, tetanus vaccine, diphteria vaccine, pertussis vaccine".
  - "Ampicillin-Su 3g/100mL 100mL BAG" should become "ampicillin-su 3g/100ml 100ml bag, ampicillin-su 3g/100ml 100ml, ampicillin-su bag, ampicillin-su, ampicillin".
- Remove hyphens after keeping the original term.
- Remove duplicates from the final CSV list.
- Ensure the final list is comma-separated.

Here is the text:
'''
{text}
'''
"""

In [None]:
# Final prompt for the answer side of medication questions: it is applied to the 'answer'
# column of the 'medication_reported' and 'medication_pyxis' rows.
# The literal "no information at all" that this prompt asks for is matched again further
# down, where such rows are set to FN, so the wording must stay in sync with that cell.
prompt_template_drugs_answer = """
Please extract the relevant drugs mentioned in the following text and return a comma-separated list in CSV format. Follow these rules:
- Clean all special characters, including dots.
- Convert all terms to lowercase.
- Ensure forms and dosages are not mentioned alone without the drug name or active ingredient.
- If an item has only dosage or form information (e.g., "2 tabs in the morning"), mark it as "2 tabs with no further information".
- Keep the translated original term and append variations within the same item. Variations should include:
  - One original term with dosage.
  - One without dosage.
  - Only market label name if present in data but keep original label names.
  - Include the active ingredient name if present.
- Variations should only be done within the same text-list-item and not combined with other items.
- Remove hyphens after keeping the original term.
- Remove duplicates from the final CSV list.
- Remove non-drug related information.
- If the text contains valid drug names without additional information, include them as they are.
- If no drug-related information is available, insert only: "no information at all".
- Ensure the final list is comma-separated.

Here is the text:
'''
{text}
'''
"""

In [None]:
def call_openai(text, template, model="gpt-3.5-turbo", temperature=0.7, max_tokens=150):
    """Send ``template`` filled with ``text`` to the OpenAI chat API and return the reply text.

    Parameters:
    text (str): Text inserted into the template's ``{text}`` placeholder.
    template (str): Prompt template containing a ``{text}`` placeholder.
    model (str): Chat model name.
    temperature (float): Sampling temperature passed to the API.
    max_tokens (int): Upper bound on the reply length; a long term list is
        cut off at this limit.

    Returns:
    The content of the first choice's message.
    """
    prompt = template.format(text=text)
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content

def gpt_make_csv_array(text, template):
    """Ask the model for a comma-separated list and return it as a list of terms.

    Parameters:
    text (str): Text to post-process.
    template (str): Prompt template with a ``{text}`` placeholder.

    Returns:
    list: The whitespace-stripped, non-empty items of the reply, or an empty
        list when the reply is empty. The raw reply is printed.
    """
    response_text = call_openai(text, template)
    print(response_text)
    if not response_text:
        return []
    # Drop blank items (e.g. from a trailing comma) so they are never embedded later.
    stripped_items = (item.strip() for item in response_text.split(','))
    return [item for item in stripped_items if item]

def loop_gpt_make_csv(df, df_column_name, template, questions_to_process, start=0, end=None):
    """Post-process one text column with GPT for the rows of selected questions.

    The result column is 'data_gpt' when ``df_column_name`` is 'data' and
    'answer_gpt' for any other column name. It is created with empty lists if
    it does not exist yet, and only cells that are still empty are filled, so
    the function can be called once per question group on the same frame.

    Parameters:
    df (pd.DataFrame): Input frame with a 'question' column; it is not modified.
    df_column_name (str): Column holding the text to post-process.
    template (str): Prompt template with a ``{text}`` placeholder.
    questions_to_process (list): Question names whose rows are processed.
    start (int): First position, within the filtered rows, to process.
    end (int or None): Position after the last filtered row to process;
        None means up to the end.

    Returns:
    pd.DataFrame: A copy of ``df`` with the result column added or updated.
    """
    df_input = df.copy()
    new_column_name = 'data_gpt' if df_column_name == 'data' else 'answer_gpt'

    if new_column_name not in df_input.columns:
        # One separate empty list per row (not a shared object).
        df_input[new_column_name] = [[] for _ in range(len(df_input))]

    if end is None:
        end = len(df_input)

    # start/end are positions within the question-filtered rows, not within the whole frame.
    df_to_loop_through = df_input[df_input['question'].isin(questions_to_process)].iloc[start:end]

    for idx, row in df_to_loop_through.iterrows():
        if not row[new_column_name]:  # Only fill if the cell is empty
            text = str(row[df_column_name])
            df_input.at[idx, new_column_name] = gpt_make_csv_array(text, template)

    return df_input

### Postprocess conditions

In [None]:
# Post-process the 'data' column of the condition questions; results are written to 'data_gpt'.
# loop_gpt_make_csv() works on a copy, so df_unprocessed itself is not modified.
processed_df = df_unprocessed
questions_to_process = ['chiefcomplaint', 'diagnosis']
processed_df = loop_gpt_make_csv(processed_df, 'data', prompt_template_conditions_data, questions_to_process, start=0, end=None)

In [None]:
# Intermediate save after the first GPT pass (relative path, so it lands in the working directory).
processed_df.to_csv('processed_output-conditions-data-gpt.csv', index=False)

In [None]:
# Continue the post-processing with the 'answer' column of the condition questions;
# results are written to 'answer_gpt'.
#df_in = processed_df
questions_to_process = ['chiefcomplaint', 'diagnosis']
processed_df = loop_gpt_make_csv(processed_df, 'answer', prompt_template_conditions_answer, questions_to_process, start=0, end=None)

In [None]:
# Check that the 'data_gpt' and 'answer_gpt' columns now exist.
processed_df.info()

### postprocess medication

In [None]:
# Post-process the 'data' column of the medication questions into 'data_gpt'.
questions_to_process = ['medication_reported', 'medication_pyxis']
processed_df = loop_gpt_make_csv(processed_df, 'data', prompt_template_drugs_data, questions_to_process, start=0, end=None)

In [None]:
# Post-process the 'answer' column of the medication questions into 'answer_gpt'.
questions_to_process = ['medication_reported', 'medication_pyxis']
processed_df = loop_gpt_make_csv(processed_df, 'answer', prompt_template_drugs_answer, questions_to_process, start=0, end=None)

In [None]:
# Preview one row after all four GPT passes.
processed_df.head(1)

In [None]:
# Save the fully post-processed frame.
# NOTE(review): list cells are written as their string representation, so they are strings after reloading.
processed_df.to_csv('processed_df-completeGpt.csv', index=False)

In [None]:
# Check whether the GPT post-processing filled every row.


# A cell counts as empty when it is null or holds an empty list.
# Series.map is applied column by column because DataFrame.applymap is deprecated in recent pandas.
empty_cells = processed_df.isnull() | processed_df.apply(
    lambda column: column.map(lambda x: len(x) == 0 if isinstance(x, list) else False)
)

# Get rows with any empty cells
rows_with_empty_cells = processed_df[empty_cells.any(axis=1)]

# Get unique questions from those rows
unique_questions_with_empty_cells = rows_with_empty_cells['question'].unique()

# Print the unique questions
print('Those question where not affected by gpt postprocessing ')
print(unique_questions_with_empty_cells)

## Classification of nominals by cosine_similarity between gpt-postprocessed data and answers:

In [None]:
# Second name for the same DataFrame object (no copy is made).
# NOTE(review): classify_nominals() is later called with processed_df, not with this name.
processed_df_done = processed_df

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the BlueBERT tokenizer and model for embeddings.
# `tokenizer` and `model` are module-level globals that compute_embeddings() reads.
model_name = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
def compute_embeddings(text):
    """Embed ``text`` as the mean of the model's last hidden states.

    The text is lower-cased and tokenized with truncation at 128 tokens,
    then run through the global ``model`` without gradient tracking.

    Returns:
    numpy.ndarray: The last hidden states averaged over dimension 1 and squeezed.
    """
    encoded = tokenizer(
        text.lower(),
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

def classify_nominals(df, cutoff_positive=0.85, cutoff_negative=0.6):
    """Label every row TP, FP or FN from embedding similarities, in place.

    For each row, every term in 'data_gpt' is compared with every term in
    'answer_gpt' by cosine similarity of their embeddings:

    - "TP" as soon as one pair reaches ``cutoff_positive`` (the comparison
      stops there, so the last recorded similarity is the matching one);
    - otherwise "FN" when the mean of all recorded similarities is at most
      ``cutoff_negative`` (rows with no pairs have mean 0 and are "FN" with
      the default cutoff);
    - otherwise "FP".

    Parameters:
    df (pd.DataFrame): Frame whose 'data_gpt' and 'answer_gpt' cells are
        lists of strings.
    cutoff_positive (float): Similarity from which a pair counts as a match.
    cutoff_negative (float): Mean similarity up to which a non-matching row
        is labelled "FN".

    Returns:
    pd.DataFrame: The same ``df`` object with 'Class' overwritten and a
        'Similarities' column holding the per-row list of computed scores.
    """
    classifications = []
    all_similarities = []

    for _, row in df.iterrows():
        data_embeddings = [compute_embeddings(item) for item in row['data_gpt']]
        answer_embeddings = [compute_embeddings(item) for item in row['answer_gpt']]

        row_classification = "FP"
        row_similarities = []
        for data_embedding in data_embeddings:
            for answer_embedding in answer_embeddings:
                similarity = cosine_similarity([data_embedding], [answer_embedding])[0][0]
                row_similarities.append(similarity)
                if similarity >= cutoff_positive:
                    row_classification = "TP"
                    break
            if row_classification == "TP":
                break

        # Mean over the similarities computed so far; 0 when nothing was compared.
        average_similarity = sum(row_similarities) / len(row_similarities) if row_similarities else 0
        if row_classification == "FP" and average_similarity <= cutoff_negative:
            row_classification = "FN"

        classifications.append(row_classification)
        all_similarities.append(row_similarities)

    df['Class'] = classifications
    df['Similarities'] = all_similarities
    return df


In [None]:
def compute_classification_metrics(df, question_types):
    """
    Count TP/FP/FN labels for the given question types and derive precision, recall and F1.

    Parameters:
    df (pd.DataFrame): Classification results with 'question' and 'Class' columns.
    question_types (list): Question names whose rows are taken into account.

    Returns:
    dict: 'Question_types' (the list passed in), 'TP_count', 'FP_count', 'FN_count',
        and 'Precision', 'Recall', 'F1_score' rounded to 2 decimals. A ratio whose
        denominator is zero is reported as 0.
    """
    # Labels of the rows that belong to the requested question types
    labels = df.loc[df['question'].isin(question_types), 'Class']
    tp_count, fp_count, fn_count = ((labels == label).sum() for label in ('TP', 'FP', 'FN'))

    predicted_positive = tp_count + fp_count
    actual_positive = tp_count + fn_count

    precision = 0
    if predicted_positive > 0:
        precision = tp_count / predicted_positive

    recall = 0
    if actual_positive > 0:
        recall = tp_count / actual_positive

    f1_score = 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return {
        'Question_types': question_types,
        'TP_count': tp_count,
        'FP_count': fp_count,
        'FN_count': fn_count,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1_score': round(f1_score, 2)
    }

In [None]:
# UTILITY METHODS

import ast
import pandas as pd
import matplotlib.pyplot as plt
import os




def sort_to_lastscore(df, cls='TP', order='asc', questions=None):
    """
    Filter the DataFrame for a specific class and questions, and sort by the last score.

    Parameters:
    df (pd.DataFrame): The input DataFrame with 'Class', 'question' and 'last_score' columns.
    cls (str): The class to filter by (compared case-insensitively).
    order (str): The sort order ('asc' for ascending, anything else for descending).
    questions (list or str or None): The questions to filter by. 'all' or None
        selects the four nominal questions; a single name may be given as a string.

    like:
    sorted_df = sort_to_lastscore(df, cls='TP', order='asc', questions=['chiefcomplaint', 'medication_reported'])
    sorted_df = sort_to_lastscore(df, cls='TP', order='asc', questions='all')

    Returns:
    pd.DataFrame: The sorted copy of the DataFrame.
    """
    # The default (None) used to reach isin(None), which raises TypeError.
    if questions is None or (isinstance(questions, str) and questions == 'all'):
        questions = ['chiefcomplaint', 'medication_reported', 'medication_pyxis', 'diagnosis']
    elif isinstance(questions, str):
        questions = [questions]

    # Filter the DataFrame for the specified class and questions
    filtered_df = df[(df['Class'].str.upper() == cls.upper()) & (df['question'].isin(questions))]

    # Sort a copy by 'last_score' so the input frame is left untouched
    sorted_df = filtered_df.copy().sort_values(by='last_score', ascending=(order == 'asc'))

    return sorted_df




def add_last_score_to_all_nominals(df):
    """
    Extract the last score from the 'Similarities' column for every row and insert it into a new column 'last_score'.

    A cell may be an actual list (frame classified in this session) or the
    string representation of a list (frame reloaded from CSV). Empty lists,
    unparsable strings and any other value yield NaN.

    Parameters:
    df (pd.DataFrame): The input DataFrame; it is modified in place.

    Returns:
    pd.DataFrame: The DataFrame with an additional 'last_score' column.
    """
    def extract_last_score(similarities):
        if isinstance(similarities, str):
            try:
                # Convert the string representation to an actual list
                similarities = ast.literal_eval(similarities)
            except (ValueError, SyntaxError, TypeError):
                return float('nan')
        if isinstance(similarities, (list, tuple)) and len(similarities) > 0:
            return similarities[-1]
        return float('nan')

    # Apply the function to extract the last score
    df['last_score'] = df['Similarities'].apply(extract_last_score)

    return df




def plot_classification_scores(df, questions=None, cutoff=0.85):
    """
    Plot a scatter plot of classification scores with a cutoff line.

    Each question gets its own marker shape; the point colour encodes the
    class (TP green, FP dark orange, FN red). The x axis is 'json_i' and the
    y axis is 'last_score'.

    Parameters:
    df (pd.DataFrame): The input DataFrame with 'question', 'json_i', 'last_score' and 'Class' columns.
    questions (list or str or None): The questions to filter by. 'all' or None
        selects the four nominal questions; a single name may be given as a string.
    cutoff (float): The cutoff line for the scores.
    """
    # The default (None) used to reach isin(None), which raises TypeError.
    if questions is None or (isinstance(questions, str) and questions == 'all'):
        questions = ['chiefcomplaint', 'medication_reported', 'medication_pyxis', 'diagnosis']
    elif isinstance(questions, str):
        questions = [questions]

    # Filter the DataFrame for the specified questions
    filtered_df = df[df['question'].isin(questions)].copy()

    # Define markers and colors for each question and class
    question_markers = {
        'chiefcomplaint': 'o',
        'medication_reported': 's',
        'medication_pyxis': '^',
        'diagnosis': 'D'
    }

    class_colors = {
        'TP': 'green',
        'FP': 'darkorange',
        'FN': 'red'
    }

    # One scatter call per question so that each gets its own marker and legend entry
    plt.figure(figsize=(10, 6))
    for question in questions:
        question_df = filtered_df[filtered_df['question'] == question]
        plt.scatter(
            question_df['json_i'],
            question_df['last_score'],
            label=question,
            marker=question_markers.get(question, 'o'),
            c=question_df['Class'].map(class_colors),
            edgecolor='k',
            s=100
        )

    # Plot the cutoff line
    plt.axhline(y=cutoff, color='blue', linestyle='--', label=f'Cutoff = {cutoff}')

    # Add labels and legend
    plt.xlabel('Index')
    plt.ylabel('Last Score')
    plt.title('Classification Scores with Cutoff Line')
    plt.legend(title='Questions')
    plt.grid(True)
    plt.show()





def load_csv_to_df(file_name):
    """
    Read a CSV file located relative to the current working directory.

    Parameters:
    file_name (str): Name (or relative path) of the CSV file.

    Returns:
    pd.DataFrame: The content of the file as parsed by pandas.read_csv.
    """
    return pd.read_csv(os.path.join(os.getcwd(), file_name))

    

def find_specific_rows_with_only_one_arrayitem(df, column_name, target_array):
    """
    Find rows where the specified column contains exactly the target array.

    A cell may be an actual list (frame built in this session) or the string
    representation of a list (frame reloaded from CSV); both are compared.
    Unparsable strings and other values never match.

    Parameters:
    df (pd.DataFrame): The DataFrame to search.
    column_name (str): The column to search within.
    target_array (list): The array to match.

    Returns:
    pd.DataFrame: The DataFrame with matching rows.
    """
    def matches_target(value):
        if isinstance(value, str):
            try:
                # Convert the string representation to an actual list
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError):
                return False
        # ast.literal_eval() rejects non-string input, so lists are compared directly.
        return isinstance(value, list) and value == target_array

    # Find rows where the column matches the target array
    matching_rows = df[df[column_name].apply(matches_target)]

    return matching_rows

# Example usage when the DataFrame is loaded from a CSV that already contains the GPT post-processing for the nominal questions (i.e. it was not done in this session)
#classified_df = load_csv_to_df('df_nominal_classified-de.csv')
#classified_df = load_csv_to_df('df_nominal_classified-de.csv')
#classified_df.head(10)

#### classification with embeddings ______ 

In [None]:
# Classification into TP, FP, FN by cosine similarity of the embeddings.
# classify_nominals() writes 'Class' and 'Similarities' into processed_df itself and
# returns that same object, so classified_df and processed_df are one DataFrame.


classified_df = classify_nominals(processed_df)

#### Check for FN ['no information at all']______

In [None]:
# Rows whose post-processed answer is exactly the "no information at all" marker.
matching_rows = find_specific_rows_with_only_one_arrayitem(classified_df, 'answer_gpt', ['no information at all'])
matching_rows.info()

In [None]:
# Inspect one of the matching rows.
matching_rows.head(1)

In [None]:


# After inspection, every row whose answer_gpt is exactly ['no information at all'] had to be
# set to FN by hand, because no clear similarity cutoff could be chosen for these rows.
# This went unnoticed with the English dialogues because very few rows were affected.
df_copysave = classified_df.copy()
no_information = ['no information at all']
# Cells are lists when the frame was classified in this session and strings after a CSV
# round trip; ast.literal_eval() accepts only strings and raises ValueError for a list.
is_no_information = classified_df['answer_gpt'].apply(
    lambda x: (ast.literal_eval(x) if isinstance(x, str) else x) == no_information
)
classified_df.loc[is_no_information, 'Class'] = 'FN'

#### Calculate Metrics

In [None]:
def save_metrics_as_json(metrics, question_types):
    """
    Save a metrics dictionary as pretty-printed JSON in the current working directory.

    The file is named ``metrics_nominal-<types>.json`` where ``<types>`` is
    the given question types joined with underscores.

    Parameters:
    metrics (dict): The metrics to save. Values may be Python natives, numpy
        scalars of any kind (ints, floats, bools) or numpy arrays.
    question_types (list of str): Question types the metrics were computed
        for; only used to build the file name.

    Raises:
    TypeError: If a value is neither JSON-serializable nor a numpy
        scalar/array.
    """
    # Fallback used by json.dumps for values it cannot serialize on its own
    def convert(o):
        # np.generic covers every numpy scalar type, not just np.int64
        if isinstance(o, np.generic):
            return o.item()
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

    # Generate the file name based on the given question types
    file_name = "metrics_nominal-" + "_".join(question_types) + ".json"

    # Serialize before opening the file so a failure does not leave a truncated file behind
    metrics_json = json.dumps(metrics, indent=4, default=convert)

    # Write the JSON string to a file with the dynamically generated name
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(metrics_json)

    print(f"Metrics saved to {file_name}")

In [None]:
# Question-type groups to evaluate. Every group goes through the same steps
# (compute metrics, save them as JSON), so they are driven from one list
# instead of one copy-pasted cell per group.
question_type_groups = [
    ['chiefcomplaint', 'diagnosis', 'medication_reported', 'medication_pyxis'],
    ['chiefcomplaint', 'diagnosis'],
    ['medication_reported', 'medication_pyxis'],
    ['diagnosis'],
    ['chiefcomplaint'],
    ['medication_reported'],
    ['medication_pyxis'],
]

metrics_by_group = {}
for question_types in question_type_groups:
    # Compute the classification metrics for this group and persist them
    metrics = compute_classification_metrics(classified_df, question_types)
    save_metrics_as_json(metrics, question_types)
    # Keyed by the same suffix that save_metrics_as_json uses in the file name
    metrics_by_group["_".join(question_types)] = metrics

# Display the metrics of all groups at once
metrics_by_group

In [None]:
# Save to CSV or handle the DataFrame as needed
classified_df.to_csv('df_nominal_classified.csv', index=False)

# Inspection

To visually inspect the results.

In [None]:
classified_df.head(1)

In [None]:
# Check the Class and Scores to see if the Set cutoff is ok
classified_df_lastscored= add_last_score_to_all_nominals(classified_df)
classified_df_lastscored.head(2)


In [None]:
# Plot the classification scores for every question selection and save each
# plot as a PNG file; one loop replaces one copy-pasted cell per selection.
for questions_str in ['all', 'chiefcomplaint', 'medication_reported', 'medication_pyxis', 'diagnosis']:
    plot_classification_scores(classified_df_lastscored, questions=questions_str)
    # Save the plot as a PNG file
    filename = f'nominal_score_cutoff_{questions_str}.png'
    plt.savefig(filename, bbox_inches='tight')
    print(f'Plot saved as {filename}')
    # Render the figure now so the next selection starts on a fresh figure,
    # as it did when every selection lived in its own cell.
    # NOTE(review): assumes plot_classification_scores does not call
    # plt.show() itself; if it does, savefig above writes an empty figure
    # (this was already the case before this change) - confirm.
    plt.show()

In [None]:
# Inspect whether the lowest-scoring TPs are still reasonable and confirm the chosen cutoff, or whether the cutoff must be changed.
#sorted_df_tp = sort_to_lastscore(classified_df_lastscored, cls='TP', order='asc', questions=['chiefcomplaint', 'medication_reported'])
#sorted_df_fp = sort_to_lastscore(classified_df_lastscored, cls='FP', order='desc', questions='all')
#sorted_df_fp = sort_to_lastscore(classified_df_lastscored, cls='FP', order='desc', questions=['diagnosis'])


sorted_df_fp_diagnosis = sort_to_lastscore(classified_df_lastscored, cls='FP', order='desc', questions=['diagnosis'])
sorted_df_fp_diagnosis.head(10)

In [None]:
sorted_df_tp_diagnosis = sort_to_lastscore(classified_df_lastscored, cls='TP', order='asc', questions=['diagnosis'])
sorted_df_tp_diagnosis.head(20)

# Numerical-Question Evaluation (like bp: 162 / 80):

In [None]:
#classified_df_numerical = classified_df.copy()
classified_df_numerical =df_filled.copy()
classified_df_numerical.head(10)

In [None]:
# Questions of interest
numerical_questions = [
    "bp",
    "heartrate",
    "o2sat_triage",
    "resprate",
    "temperature",
    "pain_triage"
]

In [None]:
import json

def save_numerical_metric(metrics, file_name='numerical_metrics.json'):
    """
    Write the given metrics dictionary to disk as indented JSON.

    Args:
    metrics (dict): The metrics to persist (tp, fp, fn, precision, recall, f1_score, instances).
    file_name (str): Path of the JSON file to write. Default is 'numerical_metrics.json'.
    """
    # Serialize up front: a non-serializable value fails before the file is opened
    serialized = json.dumps(metrics, indent=4)

    with open(file_name, 'w') as out_file:
        out_file.write(serialized)

    print(f"Numerical metrics saved to {file_name}")

In [None]:
import re

numerical_questions = [
    "bp",
    "heartrate",
    "o2sat_triage",
    "resprate",
    "temperature",
    "pain_triage"
]

def classify_numerical_data(df_numerical):
    """
    Classify every number found in the 'data' column as TP, FP or FN by
    searching for it in the 'answer' column, and compute overall metrics.

    For each number extracted from a row's 'data':
    - FN: the answer contains no number at all.
    - TP: the answer contains the value as a standalone integer token. For a
      non-integer value the truncated value, the rounded value and
      round(value - 0.5) / round(value + 0.5) are all accepted.
    - FP: the answer contains numbers, but none of the accepted ones.

    Rows whose 'data' contains no number add nothing to the counts and get an
    empty 'Class' string, so the 'Class' column always lines up with the rows.

    Args:
    df_numerical (pd.DataFrame): Frame with at least 'data' and 'answer' columns.

    Returns:
    tuple: (metrics, df_numerical_classified) where metrics is a dict with
        tp, fp, fn, precision, recall, f1_score and instances, and
        df_numerical_classified is a copy of the input with a 'Class' column
        (labels joined by "; ") placed directly after 'data'.
    """
    number_pattern = r'\d+\.?\d*'
    tp, fp, fn = 0, 0, 0
    instances = 0
    classifications = []

    def is_match(data_value, answer):
        # Integer-valued data must appear as-is; fractional data may appear
        # truncated or rounded to a neighbouring integer.
        if data_value % 1 == 0:
            candidates = [int(data_value)]
        else:
            candidates = [
                int(data_value),
                round(data_value),
                round(data_value - 0.5),
                round(data_value + 0.5),
            ]
        return any(
            re.search(r'\b{}\b'.format(candidate), answer) is not None
            for candidate in candidates
        )

    for data_cell, answer_cell in zip(df_numerical['data'], df_numerical['answer']):
        data_values = [float(token) for token in re.findall(number_pattern, str(data_cell))]
        if not data_values:
            # Keep one entry per row: skipping the row here without a
            # placeholder made the 'Class' assignment below fail with a
            # length mismatch.
            classifications.append([])
            continue

        instances += len(data_values)
        answer = str(answer_cell)

        if not re.findall(number_pattern, answer):
            # No number in the answer at all: every expected value was missed
            fn += len(data_values)
            row_classifications = ["FN"] * len(data_values)
        else:
            row_classifications = []
            for data_value in data_values:
                if is_match(data_value, answer):
                    tp += 1
                    row_classifications.append("TP")
                else:
                    fp += 1
                    row_classifications.append("FP")

        classifications.append(row_classifications)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    df_numerical_classified = df_numerical.copy()
    df_numerical_classified['Class'] = ["; ".join(cls) for cls in classifications]

    # Place 'Class' right after 'data'. 'Class' is removed before looking up
    # the position of 'data' so that an existing 'Class' column located
    # before 'data' cannot shift the target position.
    cols = [col for col in df_numerical_classified.columns if col != 'Class']
    cols.insert(cols.index('data') + 1, 'Class')
    df_numerical_classified = df_numerical_classified[cols]

    metrics = {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'instances': instances
    }

    return metrics, df_numerical_classified


# Example usage
df_numerical = df_filled[df_filled['question'].isin(numerical_questions)]
metrics, df_numerical_classified = classify_numerical_data(df_numerical)
save_numerical_metric(metrics)
print(metrics)
#df_numerical_classified.head(50)

In [None]:
# This one if df stems from first nominal classification

#classified_df_numerical = classified_df_numerical[classified_df_numerical['question'].isin(numerical_questions)]
#df_numerical_classified = classified_df_numerical
#metrics, classified_df_numerical = classify_numerical_data(classified_df_numerical)

In [None]:
df_numerical_classified.head(10)

In [None]:
import pandas as pd
from datetime import datetime

# Get current date
current_date = datetime.now().strftime("%Y-%m-%d")

# Create file name with date
file_name = f"classified_df_numerical-{current_date}-de.csv"

# Save the classified frame returned by classify_numerical_data. This cell
# used to write classified_df_numerical, which in the active flow is only the
# unclassified copy of df_filled and therefore has no numerical 'Class' column.
# NOTE(review): if the commented-out alternative flow (starting from the
# nominal classification) is used instead, the classified result is bound to
# classified_df_numerical - save that frame in that case.
df_numerical_classified.to_csv(file_name, index=False)

## Save Nominal Total as CSV for safekeeping

In [None]:
df_nominal_total = pd.concat([df_nominal_drugs_classified, df_nominal_conditions_classified], ignore_index=True)

In [None]:
df_nominal_total.info()

In [None]:
# Get current date
current_date = datetime.now().strftime("%Y-%m-%d")

# Create file name with date
file_name = f"df_nominal_total-{current_date}.csv"

# Save dataframe to CSV
df_nominal_total.to_csv(file_name, index=False)

In [None]:
def calculate_metrics(df):
    """
    Compute precision, recall and F1 from the labels in the 'Class' column.

    Only rows labelled exactly "TP", "FP" or "FN" are counted. Each ratio
    falls back to 0 when its denominator is 0.

    Parameters:
    df (pd.DataFrame): Frame with a 'Class' column.

    Returns:
    dict: tp, fp, fn, precision, recall and f1_score.
    """
    # Number of rows per label
    counts = {label: sum(df['Class'] == label) for label in ("TP", "FP", "FN")}
    tp, fp, fn = counts["TP"], counts["FP"], counts["FN"]

    predicted_positive = tp + fp
    actual_positive = tp + fn

    if predicted_positive > 0:
        precision = tp / predicted_positive
    else:
        precision = 0

    if actual_positive > 0:
        recall = tp / actual_positive
    else:
        recall = 0

    pr_sum = precision + recall
    if pr_sum > 0:
        f1 = (2 * precision * recall) / pr_sum
    else:
        f1 = 0

    return {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
    }

# Calculate metrics for the combined DataFrame
metrics_nominal_total = calculate_metrics(df_nominal_total)

# Print the metrics
metrics_nominal_total