In [2]:
######################################## Check Environment ########################################
import sys
print("Python executable:", sys.executable)
print("Python version:", sys.version)

Python executable: /opt/homebrew/anaconda3/envs/venv_mitsui_condapy310/bin/python
Python version: 3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:24:20) [Clang 17.0.6 ]


In [3]:
######################################## Install packages ########################################

import pandas as pd
import numpy as np
# Create chunks
import re

# Model for NER
import spacy 
from sklearn.cluster import KMeans
import medspacy
from medspacy.ner import TargetRule
from thefuzz import fuzz, process

#UMLSClient for NER
import umls_api
from umls_api_client import UMLS
from quickumls import QuickUMLS

# Use natural language processing (NLP) to extract keywords from the criteria
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer, util

# Performance
import sklearn
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import snowflake.connector

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
######################################## Connect to Snowflake ########################################

# Establish a connection
conn = snowflake.connector.connect(
    user='dana_george@hakkoda.io',
    authenticator='externalbrowser',
    account='ska04930.east-us-2.azure',
    warehouse='DATASCIENCE_WH',
    database='ONCOEMR_RAW_DEV',
    schema='DBO',
    role='ACCOUNTADMIN'
)

# Run a test query
cursor = conn.cursor()
cursor.execute("SELECT CURRENT_VERSION()")
row = cursor.fetchone()
print("Snowflake version:", row[0])

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/bbaf0319-e615-416f-8870-f7eacf074b66/saml2?SAMLRequest=nZJBc5swEIX%2FCqOeAQkoxBrbGdeetNRO4tq4nfQmw%2BJoDBKRREj66yuwPZMekkNvGuntfk%2F7dnz9UlfOMyjNpZgg4mHkgMhlwcVhgnbZjXuFHG2YKFglBUzQK2h0PR1rVlcNnbXmUWzgqQVtHNtIaNo%2FTFCrBJVMc00Fq0FTk9Pt7HZFAw9TpjUoY3HoXFJoblmPxjTU97uu87rQk%2BrgBxhjH498q%2Boln9AbRPMxo1HSyFxWl5IX%2B6d3EMTHUY%2BwCktYnwu%2FcHEawUeU%2FUmk6bcsW7vr%2B22GnNnld3MpdFuD2oJ65jnsNquTAW0d6CPD0SjEHjBt3Fa7gcf%2BtAo8LWRXVuwIuayb1tjmnj35JRR%2BJQ%2FcjixdTFBz5IUisFvOH5L0YXW4N1%2FjYLWLqtvNkmTfTfsUhCpdxUmyXP7qfuTI%2BXkJOOgDTrVuIRV9rMZe4SByCXEDkpGQRiHFIy8eRb%2BRs7CxcsHMUHnxPvjwap4rqWVppKi4gMHlfs9KHJKRCzH57EYkLt2rqwS7ZQIsL3ES7ePY78ML0GmB6GBETf9zLGP%2FbZPzSt7ZlNLFWlY8f3VupKqZeT9E4pHhhhduOUgp1IxXs6JQoLUNs6pkN1fAjN18o1

In [5]:
######################################## Load Data ########################################

# Get sample patient ids
cursor = conn.cursor()
cursor.execute("""
    SELECT DISTINCT "patientid" FROM ONCOEMR_RAW_DEV.DBO.DEMOGRAHPICS
    ORDER BY RANDOM()
""")
sample_patient_ids = [row[0] for row in cursor.fetchall()]
sample_patient_ids = [f"'{id}'" if isinstance(id, str) else str(id) for id in sample_patient_ids]

cursor = conn.cursor()
cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'DBO'
    AND table_catalog = 'ONCOEMR_RAW_DEV'
    AND table_type = 'BASE TABLE';
""")

# Fetch all the table names
tables = [row[0] for row in cursor.fetchall()]
#print(tables)

# Create a dictionary to hold each table as a DataFrame
table_dataframes = {}
table_dataframes_spat = {}

for table in tables:
    # First, check if the table contains 'patientid' by querying the columns of the table
    cursor.execute(f"""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = 'DBO' AND table_name = '{table}'
    """)
    
    columns = [row[0] for row in cursor.fetchall()]
    
    # If 'patientid' is a column, proceed to query the table
    if 'patientid' in columns:
        query = f"""
            SELECT *
            FROM ONCOEMR_RAW_DEV.DBO.{table}
            WHERE "patientid" IN ({', '.join(map(str, sample_patient_ids))})
        """
        cursor.execute(query)
        
        # Fetch the result and convert it to a DataFrame
        results = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        table_dataframes[table] = pd.DataFrame(results, columns=columns)

    # If 'spatientid' is a column, proceed to query the table
    if 'spatientid' in columns:
        query = f"""
            SELECT *
            FROM ONCOEMR_RAW_DEV.DBO.{table}
            WHERE "spatientid" IN ({', '.join(map(str, sample_patient_ids))})
        """
        cursor.execute(query)
        
        # Fetch the result and convert it to a DataFrame
        results_spat = cursor.fetchall()
        columns_spat = [desc[0] for desc in cursor.description]
        table_dataframes_spat[table] = pd.DataFrame(results_spat, columns=columns_spat)

# Merge table_dataframes_spat into table_dataframes_pat
table_dataframes.update(table_dataframes_spat)

# Now table_dataframes_pat contains all the tables from both dictionaries
print("Data Loaded Successfully!")
print(" ")
print("Tables Loaded:")
for table, df in table_dataframes.items():
    print(f"{table}")
    #print(df.head())

# Loop through the dictionary to create a separate DataFrame variable for each key
for table, df in table_dataframes.items():
    globals()[table] = df

# Now you can access the DataFrames as individual variables:
# print(ADMINISTRATIONS.head())

# Bring in clinical trial data
clinical_trials_excl = pd.read_csv('clinical_trials_data_simple_exclusion.csv')
clinical_trials_incl = pd.read_csv('agegender_incl1.csv')
print("clinical_trials_data_simple_exclusion")
print("clinical_trials_data_simple_inclusion")
print(" ")

def print_columns_of_dict_of_dfs(df_dict):
    """Prints the columns of each DataFrame in a dictionary of DataFrames."""

    for df_name, df in df_dict.items():
        print(f"Columns of {df_name}:")
        print(df.columns)
        print("-" * 20)

# Call the function to print the columns
print_columns_of_dict_of_dfs(table_dataframes)

Data Loaded Successfully!
 
Tables Loaded:
ADMINISTRATIONS
ADVANCEDIRECTIVES
ALLERGY
CHARGE
DEMOGRAHPICS
DEMOGRAPHICS
DIAGNOSIS
DISEASESTATUS
ERX
FAMILYHISTORY
HOSPITALIZATION
INSURANCE
LABS
ORDERS
RADIOLOGY
REFERRINGPROVIDER
SOCIALHISTORY
TRANSFUSION
GRADESCALES
SURGICALHISTORY
PERFORMANCE
VISIT
BIOMARKERS
TOXICITIES
MEDICATIONLIST
STAGING
DATA_HISTORY
PATIENT_LOCATION_HISTORY
ORDER_CHARGE_HISTORY
TREATMENT_CURRENT_HISTORY
VITAL_SIGN_HISTORY
TREATMENT_PREVIOUS_HISTORY
clinical_trials_data_simple_exclusion
clinical_trials_data_simple_inclusion
 
Columns of ADMINISTRATIONS:
Index(['clientid', 'administrationid', 'diagnosisid', 'doseadministered',
       'doseapproved', 'drugname', 'duration', 'intent', 'endreason', 'form',
       'targetdrugname', 'targetdrugshortname', 'targetdrugcategory', 'ndc',
       'nodosestaken', 'orderedamount', 'ordereddate', 'administeredunits',
       'targetadministeredunits', 'orderid', 'patientid', 'plannedcycles',
       'providerid', 'orderhassignoff', 

In [6]:
######################################## Feature Engineering ########################################

# Convert non-numeric values to NaN
DEMOGRAPHICS['age'] = pd.to_numeric(DEMOGRAPHICS['age'], errors='coerce')

# Now, convert the column to integers (NaNs will remain as NaN)
DEMOGRAPHICS['age'] = DEMOGRAPHICS['age'].fillna(-1).astype(int)  
print("Feature Engineering Complete!")

Feature Engineering Complete!


In [7]:
######################################## Quality Check ########################################
print(clinical_trials_incl.columns)
print(DEMOGRAPHICS)

Index(['Trial_Name', 'Trial_ID', 'Inclusion_Criteria'], dtype='object')
      RowID clientid                             patientid patientmrn  \
0    112304   CA0026  D6288764-EBAB-429D-9D6E-BA5152340FDD       None   
1    158902   CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
2       924   CA0026  A552A6C5-63DB-4152-B0E3-4C1A94C9CF27       None   
3    109938   CA0026  ECCE7682-F192-4E4B-8915-D3808F81E60E       None   
4    131886   CA0026  B995B177-DA99-4BEF-9818-8E40DCD9841D       None   
..      ...      ...                                   ...        ...   
995     148   CA0026  9D1DAE72-E47F-453C-AF06-12AA2B97855C       None   
996     228   CA0026  E393B5A7-862D-43DD-97A2-4B2075FBD297       None   
997     398   CA0026  F90AA3C5-D2D7-4A6E-AFC5-024A1F912112       None   
998     878   CA0026  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E       None   
999     275   CA0026  F124E3E2-C129-4FA2-8144-BF11CEC44CFB       None   

    patientssn         primaryphysicianid   dob  bi

In [8]:
######################################## Build Mock Expert Decision ########################################
# Perform the LEFT JOIN
merged_df = DEMOGRAPHICS.merge(DIAGNOSIS, on='patientid', how='left')

# Filter using "LIKE" equivalent
eligible = merged_df[
    (merged_df['age'] >= 18) &
    (merged_df['gender'] == 'Female')
    # (merged_df['targetdetaileddiagnosisgroup'].str.contains('breast', case=False, na=False)) &
    # (merged_df['targetdetaileddiagnosisgroup'].str.contains('cancer', case=False, na=False))
]

# # For evaluation metrics later
# eligible['Expert_Decision_Age'] = 1
# eligible['Expert_Decision_Gender'] = 1
eligible['Expert_Decision'] = 1

#distinct_count = eligible['patientid'].nunique()

# Extract patient IDs that match the expert's eligibility criteria
eligible_patient_ids = eligible['patientid'].unique().tolist()

# Get patient IDs that are not in the eligible list
ineligible_patient_ids = merged_df[~merged_df['patientid'].isin(eligible_patient_ids)]['patientid'].unique().tolist()

print(len(eligible_patient_ids))
print(len(ineligible_patient_ids))

print(eligible)

579
420
       RowID clientid_x                             patientid patientmrn  \
10    158902     CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
11    158902     CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
12    158902     CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
13    158902     CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
14    158902     CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
...      ...        ...                                   ...        ...   
4370     439     CA0026  9B4AB207-5DE3-43ED-8267-16C8FE7409A8       None   
4373     123     CA0026  D0BF1727-EB9C-4D6C-86AF-2BA142AC3ABA       None   
4375     593     CA0026  87F5D3B1-7EBB-48C9-ACD4-7AB4D211701D       None   
4379     148     CA0026  9D1DAE72-E47F-453C-AF06-12AA2B97855C       None   
4382     878     CA0026  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E       None   

     patientssn         primaryphysicianid   dob  birthyear  age   dod  ...  \


In [9]:
eligible.to_csv('test_eligible.csv', index=False)

In [10]:
# ######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - 1 line of text, model testing ########################################

# ### Test to apply to 1 line of text
# ### Create Inclusion Criteria Categories to be used later in column matching/finding

# # Load the MedSpaCy model
# nlp = spacy.load('en_ner_bc5cdr_md')

# # Process your text
# text = "The patient is a female 18 years old and was diagnosed with breast cancer and prescribed Tamoxifen."

# # Function to extract entities and labels
# def extract_entities(text):
#     doc = nlp(text)
#     entities = [(ent.text, ent.label_) for ent in doc.ents]

#     # Custom check for age-related information (e.g., "18 years old")
#     age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
#     age_matches = re.findall(age_pattern, text, re.IGNORECASE)
    
#     # If age-related information is found, add it to the entities with the correct label
#     for age in age_matches:
#         entities.append((f"{age} years old", 'AGE'))
    
#     # Custom check for gender-related information (e.g., "Male", "Female")
#     gender_keywords = ['female', 'male']  # We only need to check for 'female' and 'male'
    
#     # Check for the first gender-related term match (female first, then male)
#     gender_found = False
#     for gender in gender_keywords:
#         match = re.search(r'\b' + gender + r'\b', text, re.IGNORECASE)
#         if match:
#             entities.append((match.group(), 'GENDER'))
#             break  # Once a match is found, stop further checking

#     return entities

# # Display named entities and custom additions
# entities = extract_entities(text)
# for ent in entities:
#     print(f"Entity: {ent[0]}, Label: {ent[1]}")

Entity: breast cancer, Label: DISEASE
Entity: Tamoxifen, Label: CHEMICAL
Entity: 18 years old, Label: AGE
Entity: female, Label: GENDER


In [9]:
######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - Clinical Trial Dataframe ########################################

### Apply to a dataframe of trial data
### Create Inclusion Criteria Categories to be used later in column matching/finding

# Load the MedSpaCy model
nlp = spacy.load('en_ner_bc5cdr_md')

# Function to extract entities and labels
def extract_entities(text):
    # Process the text through the NLP model
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Custom check for age-related information (e.g., "18 years old")
    age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
    age_matches = re.findall(age_pattern, text, re.IGNORECASE)
    
    # If age-related information is found, add it to the entities with the correct label
    for age in age_matches:
        entities.append((f"{age[0]} years old", 'AGE'))
    
    # Custom check for gender-related information (e.g., "Male", "Female")
    gender_keywords = ['female', 'male']  # We only need to check for 'female' and 'male'
    
    # Check for the first gender-related term match (female first, then male)
    gender_found = False
    for gender in gender_keywords:
        match = re.search(r'\b' + gender + r'\b', text, re.IGNORECASE)
        if match:
            entities.append((match.group(), 'GENDER'))
            break  # Once a match is found, stop further checking

    # Extract the unique labels to avoid duplicates and return them
    unique_labels = set([label for _, label in entities])
    return list(unique_labels)

# Apply the function to the inclusion_criteria column and create a new 'Category' column
clinical_trials_incl['Category'] = clinical_trials_incl['Inclusion_Criteria'].apply(lambda x: ', '.join(extract_entities(x)))

# Display the updated DataFrame
print(clinical_trials_incl)

   Trial_Name  Trial_ID          Inclusion_Criteria Category
0  Test_Trial       123            Aged 18 or over.      AGE
1  Test_Trial       123                      Female   GENDER
2  Test_Trial       123  Diagnosed with Lung Cancer  DISEASE


In [10]:
######################################## Use Fuzzy: Find columns in Patient Data that match Trial Inclusion Criteria ########################################

# Function to find exact matches and fuzzy matches
def find_matching_columns(category, dict_of_dfs, fuzzy_threshold=80):
    if category.lower() == 'disease':
        return ['DIAGNOSIS.targetdetaileddiagnosisgroup']
    
    # Step 1: Find exact matches (case-insensitive)
    exact_matches = []
    for df_name, df in dict_of_dfs.items():
        if category.lower() in [col.lower() for col in df.columns]:
            exact_column = next(col for col in df.columns if col.lower() == category.lower())
            exact_matches.append(f'{df_name}.{exact_column}')
            return exact_matches  # Return immediately after finding an exact match
    
    # Step 2: If no exact match, find fuzzy matches
    fuzzy_matches = []
    for df_name, df in dict_of_dfs.items():
        columns = df.columns
        for column in columns:
            score = process.extractOne(category, [column])  # Compare category with each column
            if score and score[1] >= fuzzy_threshold:  # If score is above threshold
                fuzzy_matches.append(f'{df_name}.{column}')
    
    return fuzzy_matches

# Loop through the clinical_trials_incl DataFrame and apply matching function
def add_source_columns(clinical_trials_incl, table_dataframes):
    source_columns_list = []
    
    for index, row in clinical_trials_incl.iterrows():
        category = row['Category']
        matching_columns = find_matching_columns(category, table_dataframes)
        
        # If there are multiple matches, list them, else return 'No match'
        if matching_columns:
            source_columns_list.append(', '.join(matching_columns))
        else:
            source_columns_list.append('No match')
    
    clinical_trials_incl['Source_Columns'] = source_columns_list
    return clinical_trials_incl

# Apply the function to the clinical_trials_incl DataFrame
clinical_trials_incl_ner = add_source_columns(clinical_trials_incl, table_dataframes)

# Display the updated DataFrame
print(clinical_trials_incl_ner)

######################### Now the clinical trial data is ready. #########################

   Trial_Name  Trial_ID          Inclusion_Criteria Category  \
0  Test_Trial       123            Aged 18 or over.      AGE   
1  Test_Trial       123                      Female   GENDER   
2  Test_Trial       123  Diagnosed with Lung Cancer  DISEASE   

                           Source_Columns  
0                        DEMOGRAHPICS.age  
1                     DEMOGRAHPICS.gender  
2  DIAGNOSIS.targetdetaileddiagnosisgroup  


In [45]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

# Load RoBERTa model and tokenizer
model_name = "roberta-base"  # You can also use "bert-base-uncased" or "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def calculate_similarity_transformer(inclusion_criteria, source_value):
    """
    Calculate similarity between two text strings using RoBERTa model embeddings.
    """
    if pd.isna(inclusion_criteria) or pd.isna(source_value) or not inclusion_criteria.strip() or not source_value.strip():
        #print(f"Skipping empty input: inclusion_criteria = {inclusion_criteria}, source_value = {source_value}")
        return 0  # If either string is empty, return 0 as match percentage
    
    #print(f"Comparing: {inclusion_criteria} vs {source_value}")
    
    # Tokenize both inclusion_criteria and source_value separately
    inputs_inclusion = tokenizer(inclusion_criteria, return_tensors="pt", padding=True, truncation=True)
    inputs_source = tokenizer(source_value, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        # Get embeddings for both input texts
        embeddings_inclusion = model(**inputs_inclusion).last_hidden_state.mean(dim=1)  # Pool the embeddings
        embeddings_source = model(**inputs_source).last_hidden_state.mean(dim=1)  # Pool the embeddings
    
    # Print embeddings shape for debugging
    #print(f"Inclusion Embeddings shape: {embeddings_inclusion.shape}")
    #print(f"Source Embeddings shape: {embeddings_source.shape}")
    
    # If embeddings are empty, return 0
    if embeddings_inclusion.numel() == 0 or embeddings_source.numel() == 0:
        print("Empty embeddings detected!")
        return 0

    # Cosine similarity between the two embeddings
    similarity = torch.nn.functional.cosine_similarity(embeddings_inclusion, embeddings_source)
    return round(similarity.item() * 100, 2)

def match_patients_to_trials(clinical_trials_incl_ner, table_dataframes):
    """
    Match patients to clinical trials using RoBERTa for similarity calculation.
    """
    results = []

    for _, row in clinical_trials_incl_ner.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']
        source_column = row['Source_Columns']
        
        table_name, column_name = source_column.split('.')
        
        # Check if the table exists in the provided dataframes
        if table_name in table_dataframes:
            df = table_dataframes[table_name]
            if column_name in df.columns:
                for _, patient_row in df.iterrows():
                    patient_id = patient_row['patientid']
                    source_value = patient_row[column_name]

                    # Use RoBERTa for all categories (no need to parse age/gender)
                    match_percentage = calculate_similarity_transformer(inclusion_criteria, source_value)

                    # Append results
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage
                    })

    return pd.DataFrame(results)

# Run Matching
matched_results = match_patients_to_trials(clinical_trials_incl_ner, table_dataframes)

# Display the result
print(matched_results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                Patient_ID  Trial_Name  Trial_ID  \
0     D6288764-EBAB-429D-9D6E-BA5152340FDD  Test_Trial       123   
1     548CD51B-9AC0-4A25-9D48-B6D08675DD00  Test_Trial       123   
2     A552A6C5-63DB-4152-B0E3-4C1A94C9CF27  Test_Trial       123   
3     ECCE7682-F192-4E4B-8915-D3808F81E60E  Test_Trial       123   
4     B995B177-DA99-4BEF-9818-8E40DCD9841D  Test_Trial       123   
...                                    ...         ...       ...   
6027  C95B62FB-36CC-405C-A4BD-F419053188BB  Test_Trial       123   
6028  7F7E43E7-4438-4A9A-A11B-299DDAB8DB58  Test_Trial       123   
6029  6876BBD9-29FE-4B06-947B-D7EC22355212  Test_Trial       123   
6030  67EEC590-6EDC-4A20-8D00-490754E25110  Test_Trial       123   
6031  DC49B6B7-027A-4416-BA1B-922031859121  Test_Trial       123   

              Inclusion_Criteria Category  \
0               Aged 18 or over.      AGE   
1               Aged 18 or over.      AGE   
2               Aged 18 or over.      AGE   
3  

In [42]:
# Step 1: Extract the age and gender information from Source_Column and Source_Value
age_data = matched_results[matched_results['Source_Column'] == 'DEMOGRAHPICS.age'].drop_duplicates()

# Convert 'Source_Value' to numeric for age data, coerce errors to NaN if there are non-numeric entries
age_data['Source_Value'] = pd.to_numeric(age_data['Source_Value'], errors='coerce')

# First, merge age data
age_merged = pd.merge(age_data, eligible[['patientid', 'age', 'Expert_Decision']], 
                       left_on=['Patient_ID', 'Source_Value'], right_on=['patientid', 'age'], 
                       how='inner').drop_duplicates()

# Display the merged dataframe with relevant columns
age_merged = age_merged[['Patient_ID', 'Trial_Name', 'Trial_ID', 'Inclusion_Criteria', 'Category', 'Source_Column', 'Source_Value', 'Match_Percentage', 'Expert_Decision']]
print(age_merged)

                                Patient_ID  Trial_Name  Trial_ID  \
0     548CD51B-9AC0-4A25-9D48-B6D08675DD00  Test_Trial       123   
14    A552A6C5-63DB-4152-B0E3-4C1A94C9CF27  Test_Trial       123   
34    D7498EED-DFEB-4B1D-AF0F-AAABD6F97760  Test_Trial       123   
43    6C4D35A4-465F-49C5-B4FE-744CBF5D2AB7  Test_Trial       123   
66    0274040C-DCE0-4217-9821-2B7318E5F8D1  Test_Trial       123   
...                                    ...         ...       ...   
2750  9B4AB207-5DE3-43ED-8267-16C8FE7409A8  Test_Trial       123   
2751  D0BF1727-EB9C-4D6C-86AF-2BA142AC3ABA  Test_Trial       123   
2752  87F5D3B1-7EBB-48C9-ACD4-7AB4D211701D  Test_Trial       123   
2753  9D1DAE72-E47F-453C-AF06-12AA2B97855C  Test_Trial       123   
2754  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E  Test_Trial       123   

     Inclusion_Criteria Category     Source_Column  Source_Value  \
0      Aged 18 or over.      AGE  DEMOGRAHPICS.age          61.0   
14     Aged 18 or over.      AGE  DEMOGRAHPICS.

In [48]:
# Step 1: Extract the age and gender information from Source_Column and Source_Value
gender_data = matched_results[matched_results['Source_Column'] == 'DEMOGRAHPICS.gender'].drop_duplicates()

# Convert 'Source_Value' to string to align with 'gender' column in 'eligible'
gender_data['Source_Value'] = gender_data['Source_Value'].astype(str)

# Ensure 'gender' column in 'eligible' is also string (if necessary)
eligible['gender'] = eligible['gender'].astype(str)

# First, merge age data
gender_merged = pd.merge(gender_data, eligible[['patientid', 'gender', 'Expert_Decision']], 
                       left_on=['Patient_ID', 'Source_Value'], right_on=['patientid', 'gender'], 
                       how='inner').drop_duplicates()

# Display the merged dataframe with relevant columns
gender_merged = gender_merged[['Patient_ID', 'Trial_Name', 'Trial_ID', 'Inclusion_Criteria', 'Category', 'Source_Column', 'Source_Value', 'Match_Percentage', 'Expert_Decision']]
print(gender_merged)

                                Patient_ID  Trial_Name  Trial_ID  \
0     548CD51B-9AC0-4A25-9D48-B6D08675DD00  Test_Trial       123   
14    A552A6C5-63DB-4152-B0E3-4C1A94C9CF27  Test_Trial       123   
34    D7498EED-DFEB-4B1D-AF0F-AAABD6F97760  Test_Trial       123   
43    6C4D35A4-465F-49C5-B4FE-744CBF5D2AB7  Test_Trial       123   
66    0274040C-DCE0-4217-9821-2B7318E5F8D1  Test_Trial       123   
...                                    ...         ...       ...   
2750  9B4AB207-5DE3-43ED-8267-16C8FE7409A8  Test_Trial       123   
2751  D0BF1727-EB9C-4D6C-86AF-2BA142AC3ABA  Test_Trial       123   
2752  87F5D3B1-7EBB-48C9-ACD4-7AB4D211701D  Test_Trial       123   
2753  9D1DAE72-E47F-453C-AF06-12AA2B97855C  Test_Trial       123   
2754  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E  Test_Trial       123   

     Inclusion_Criteria Category        Source_Column Source_Value  \
0                Female   GENDER  DEMOGRAHPICS.gender       Female   
14               Female   GENDER  DEMOGRAHP

In [49]:
gender_merged.to_csv('gender_merged.csv', index=False)

In [20]:
######################################## Evaluate by Inclusion Criteria ########################################

matched_results_agegender = matched_results[matched_results['Category'].isin(['AGE', 'GENDER'])]

# Loop through all unique categories in the 'Category' column
for category in matched_results_agegender['Category'].unique():
    # Filter matched results for the current category
    matched_results_category = matched_results_agegender[matched_results_agegender['Category'] == category]
    
    # Copy the filtered data for further evaluation
    eval_result_df_category = matched_results_category.copy()
    
    # Apply model decision (based on match percentage)
    eval_result_df_category['Model_Decision'] = eval_result_df_category['Match_Percentage'].apply(lambda x: 1 if x > 70 else 0)
    
    # Get the true values for evaluation (assuming 'Expert_Decision_Disease' exists in the data)
    y_true_category = eval_result_df_category['Expert_Decision']
    y_pred_category = eval_result_df_category['Model_Decision']
    
    # Calculate confusion matrix and Cohen's Kappa score
    conf_matrix_category = confusion_matrix(y_true_category, y_pred_category)
    kappa_score_category = cohen_kappa_score(y_true_category, y_pred_category)
    
    # Print results for the current category
    print(f"Results for Category: {category}")
    print(f"\nConfusion Matrix:")
    print(conf_matrix_category)
    print("\nCohen's Kappa Score:", kappa_score_category)
    
    # Print summary of eligible and ineligible patients
    eligible_patient_ids = eval_result_df_category[eval_result_df_category['Model_Decision'] == 1]
    ineligible_patient_ids = eval_result_df_category[eval_result_df_category['Model_Decision'] == 0]
    
    print(f"\nPatients Eligible: {len(eligible_patient_ids)}")
    print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
    print("\n" + "="*50 + "\n")

KeyError: 'Expert_Decision'