In [1]:
######################################## Check Environment ########################################
import sys
print("Python executable:", sys.executable)
print("Python version:", sys.version)

Python executable: /opt/homebrew/anaconda3/envs/venv_mitsui_condapy310/bin/python
Python version: 3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:24:20) [Clang 17.0.6 ]


In [2]:
######################################## Install packages ########################################

import pandas as pd
import numpy as np
# Create chunks
import re

# Model for NER
import spacy 
from sklearn.cluster import KMeans
import medspacy
from medspacy.ner import TargetRule
from thefuzz import fuzz, process

#UMLSClient for NER
import umls_api
from umls_api_client import UMLS
from quickumls import QuickUMLS

# Use natural language processing (NLP) to extract keywords from the criteria
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer, util

# Performance
import sklearn
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import snowflake.connector

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
######################################## Connect to Snowflake ########################################

# Establish a connection
conn = snowflake.connector.connect(
    user='dana_george@hakkoda.io',
    authenticator='externalbrowser',
    account='ska04930.east-us-2.azure',
    warehouse='DATASCIENCE_WH',
    database='ONCOEMR_RAW_DEV',
    schema='DBO',
    role='ACCOUNTADMIN'
)

# Run a test query
cursor = conn.cursor()
cursor.execute("SELECT CURRENT_VERSION()")
row = cursor.fetchone()
print("Snowflake version:", row[0])

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/bbaf0319-e615-416f-8870-f7eacf074b66/saml2?SAMLRequest=nZJBc9owEIX%2Fikc925aNY4MGyJAQpnSSlIJhJr0JWyYqsmS0cgz59ZUNzKSH5NCbRnq739O%2BHd4eS%2BG8MQ1cyREKPIwcJjOVc7kboXU6c%2FvIAUNlToWSbIRODNDteAi0FBWZ1OZVLtmhZmAc20gCaR9GqNaSKAociKQlA2Iyspo8PZLQw4QCMG0sDl1KcuCW9WpMRXy%2FaRqv6XlK7%2FwQY%2BzjgW9VreQb%2BoCovmZUWhmVKXEtOdo%2FfYIIfBy1CKuwhMWl8I7L8wi%2BomzPIiDf03ThLn6uUuRMrr%2B7VxLqkukV0288Y%2Bvl49kAWAewpzga9LDHKBi3Bjf06HutmQdSNYWge5apsqqNbe7Zk1%2Bw3Bdqx%2B3I5tMRqvY8p9ndZpMuZ2sRH36Uv2awgMlp%2B%2FTwJwvlKjoNnl%2FmL4PkWB74Q4aczTXgsA14DlCzuWxjNfYKh5EbBG6I06BHgoTcxF6U4N%2FImdpYuaSmq7x673x4Jc%2B0AlUYJQWXrHO53dIC94KBy%2BLgxo2CuHD7%2FQS7RcJoVuAk2sax34YXovMCkc6IHv%2FnWIb%2BxyaXlXy2Kc2nCyV4dnJmSpfUfB5i4AXdDc%2FdopMSVlIuJnmuGYANUw

In [4]:
######################################## Load Data ########################################

# Get sample patient ids
cursor = conn.cursor()
cursor.execute("""
    SELECT DISTINCT "patientid" FROM ONCOEMR_RAW_DEV.DBO.DEMOGRAHPICS
    ORDER BY RANDOM()
""")
sample_patient_ids = [row[0] for row in cursor.fetchall()]
sample_patient_ids = [f"'{id}'" if isinstance(id, str) else str(id) for id in sample_patient_ids]

cursor = conn.cursor()
cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'DBO'
    AND table_catalog = 'ONCOEMR_RAW_DEV'
    AND table_type = 'BASE TABLE';
""")

# Fetch all the table names
tables = [row[0] for row in cursor.fetchall()]
#print(tables)

# Create a dictionary to hold each table as a DataFrame
table_dataframes = {}
table_dataframes_spat = {}

for table in tables:
    # First, check if the table contains 'patientid' by querying the columns of the table
    cursor.execute(f"""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = 'DBO' AND table_name = '{table}'
    """)
    
    columns = [row[0] for row in cursor.fetchall()]
    
    # If 'patientid' is a column, proceed to query the table
    if 'patientid' in columns:
        query = f"""
            SELECT *
            FROM ONCOEMR_RAW_DEV.DBO.{table}
            WHERE "patientid" IN ({', '.join(map(str, sample_patient_ids))})
        """
        cursor.execute(query)
        
        # Fetch the result and convert it to a DataFrame
        results = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        table_dataframes[table] = pd.DataFrame(results, columns=columns)

    # If 'spatientid' is a column, proceed to query the table
    if 'spatientid' in columns:
        query = f"""
            SELECT *
            FROM ONCOEMR_RAW_DEV.DBO.{table}
            WHERE "spatientid" IN ({', '.join(map(str, sample_patient_ids))})
        """
        cursor.execute(query)
        
        # Fetch the result and convert it to a DataFrame
        results_spat = cursor.fetchall()
        columns_spat = [desc[0] for desc in cursor.description]
        table_dataframes_spat[table] = pd.DataFrame(results_spat, columns=columns_spat)

# Merge table_dataframes_spat into table_dataframes_pat
table_dataframes.update(table_dataframes_spat)

# Now table_dataframes_pat contains all the tables from both dictionaries
print("Data Loaded Successfully!")
print(" ")
print("Tables Loaded:")
for table, df in table_dataframes.items():
    print(f"{table}")
    #print(df.head())

# Loop through the dictionary to create a separate DataFrame variable for each key
for table, df in table_dataframes.items():
    globals()[table] = df

# Now you can access the DataFrames as individual variables:
# print(ADMINISTRATIONS.head())

# Bring in clinical trial data
clinical_trials_excl = pd.read_csv('clinical_trials_data_simple_exclusion.csv')
clinical_trials_incl = pd.read_csv('clinical_trials_data_simple_inclusion.csv')
print("clinical_trials_data_simple_exclusion")
print("clinical_trials_data_simple_inclusion")
print(" ")

def print_columns_of_dict_of_dfs(df_dict):
    """Prints the columns of each DataFrame in a dictionary of DataFrames."""

    for df_name, df in df_dict.items():
        print(f"Columns of {df_name}:")
        print(df.columns)
        print("-" * 20)

# Call the function to print the columns
print_columns_of_dict_of_dfs(table_dataframes)

Data Loaded Successfully!
 
Tables Loaded:
ADMINISTRATIONS
ADVANCEDIRECTIVES
ALLERGY
CHARGE
DEMOGRAHPICS
DEMOGRAPHICS
DIAGNOSIS
DISEASESTATUS
ERX
FAMILYHISTORY
HOSPITALIZATION
INSURANCE
LABS
ORDERS
RADIOLOGY
REFERRINGPROVIDER
SOCIALHISTORY
TRANSFUSION
GRADESCALES
SURGICALHISTORY
PERFORMANCE
VISIT
BIOMARKERS
TOXICITIES
MEDICATIONLIST
STAGING
DATA_HISTORY
PATIENT_LOCATION_HISTORY
ORDER_CHARGE_HISTORY
TREATMENT_CURRENT_HISTORY
VITAL_SIGN_HISTORY
TREATMENT_PREVIOUS_HISTORY
clinical_trials_data_simple_exclusion
clinical_trials_data_simple_inclusion
 
Columns of ADMINISTRATIONS:
Index(['clientid', 'administrationid', 'diagnosisid', 'doseadministered',
       'doseapproved', 'drugname', 'duration', 'intent', 'endreason', 'form',
       'targetdrugname', 'targetdrugshortname', 'targetdrugcategory', 'ndc',
       'nodosestaken', 'orderedamount', 'ordereddate', 'administeredunits',
       'targetadministeredunits', 'orderid', 'patientid', 'plannedcycles',
       'providerid', 'orderhassignoff', 

In [5]:
######################################## Feature Engineering ########################################

# Convert non-numeric values to NaN
DEMOGRAPHICS['age'] = pd.to_numeric(DEMOGRAPHICS['age'], errors='coerce')

# Now, convert the column to integers (NaNs will remain as NaN)
DEMOGRAPHICS['age'] = DEMOGRAPHICS['age'].fillna(-1).astype(int)  
print("Feature Engineering Complete!")

Feature Engineering Complete!


In [6]:
######################################## Quality Check ########################################
print(clinical_trials_incl.columns)
print(DEMOGRAPHICS)

Index(['Trial_Name', 'Trial_ID', 'Inclusion_Criteria'], dtype='object')
      RowID clientid                             patientid patientmrn  \
0    112304   CA0026  D6288764-EBAB-429D-9D6E-BA5152340FDD       None   
1    158902   CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
2       924   CA0026  A552A6C5-63DB-4152-B0E3-4C1A94C9CF27       None   
3    109938   CA0026  ECCE7682-F192-4E4B-8915-D3808F81E60E       None   
4    131886   CA0026  B995B177-DA99-4BEF-9818-8E40DCD9841D       None   
..      ...      ...                                   ...        ...   
995     148   CA0026  9D1DAE72-E47F-453C-AF06-12AA2B97855C       None   
996     228   CA0026  E393B5A7-862D-43DD-97A2-4B2075FBD297       None   
997     398   CA0026  F90AA3C5-D2D7-4A6E-AFC5-024A1F912112       None   
998     878   CA0026  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E       None   
999     275   CA0026  F124E3E2-C129-4FA2-8144-BF11CEC44CFB       None   

    patientssn         primaryphysicianid   dob  bi

In [16]:
######################################## Build Mock Expert Decision ########################################
# Perform the LEFT JOIN
merged_df = DEMOGRAPHICS.merge(DIAGNOSIS, on='patientid', how='left')

# Filter using "LIKE" equivalent
eligible = merged_df[
    (merged_df['age'] >= 18) &
    (merged_df['gender'] == 'Female') &
    (merged_df['targetdetaileddiagnosisgroup'].str.contains('lung', case=False, na=False)) &
    (merged_df['targetdetaileddiagnosisgroup'].str.contains('cancer', case=False, na=False))
]

# For evaluation metrics later
eligible['Expert_Decision_Age'] = 1
eligible['Expert_Decision_Gender'] = 1
eligible['Expert_Decision_Disease'] = 1

# Extract patient IDs that match the expert's eligibility criteria
eligible_patient_ids = eligible['patientid'].unique().tolist()

# Get patient IDs that are not in the eligible list
ineligible_patient_ids = merged_df[~merged_df['patientid'].isin(eligible_patient_ids)]['patientid'].unique().tolist()

print(len(eligible_patient_ids))
print(len(ineligible_patient_ids))

print(eligible)

17
982
       RowID clientid_x                             patientid patientmrn  \
118    94641     CA0026  294CAE76-8964-46C5-8FED-9971C96E25D4       None   
177   154570     CA0026  2D6D7E4D-9B77-4D4A-9A13-722DFB96BE29       None   
211   126824     CA0026  EDCC0EB8-53BA-45D5-B739-24BC3608FC28       None   
753      685     CA0026  C3BC10E4-EB63-4878-AFA8-DC52A4F0C041       None   
906      211     CA0026  AC2793B2-5427-4200-A07E-1006E87B0418       None   
1088     678     CA0026  86F96AD3-1CD5-4B67-8543-CE8D95ABB14D       None   
1304  186589     CA0026  1175F541-E77F-422C-AF11-3DFA8DC0F1B3       None   
1330     269     CA0026  CCF52C0F-833D-4011-95BE-AABD74C8C655       None   
1370     235     CA0026  26F3A31D-B1C2-46BD-8782-339F26A9762D       None   
2074     500     CA0026  93DC6A44-9C30-4BA6-A9C9-07094F4777DC       None   
2117      93     CA0026  12CC7584-E140-404F-A045-0F8532837A9C       None   
2499     220     CA0026  3DA798A9-E309-45A6-92A6-BBD5910F3D6B       None   
2556 

In [8]:
######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - 1 line of text, model testing ########################################

### Test to apply to 1 line of text
### Create Inclusion Criteria Categories to be used later in column matching/finding

# Load the MedSpaCy model
nlp = spacy.load('en_ner_bc5cdr_md')

# Process your text
text = "The patient is a female 18 years old and was diagnosed with breast cancer and prescribed Tamoxifen."

# Function to extract entities and labels
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Custom check for age-related information (e.g., "18 years old")
    age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
    age_matches = re.findall(age_pattern, text, re.IGNORECASE)
    
    # If age-related information is found, add it to the entities with the correct label
    for age in age_matches:
        entities.append((f"{age} years old", 'AGE'))
    
    # Custom check for gender-related information (e.g., "Male", "Female")
    gender_keywords = ['female', 'male']  # We only need to check for 'female' and 'male'
    
    # Check for the first gender-related term match (female first, then male)
    gender_found = False
    for gender in gender_keywords:
        match = re.search(r'\b' + gender + r'\b', text, re.IGNORECASE)
        if match:
            entities.append((match.group(), 'GENDER'))
            break  # Once a match is found, stop further checking

    return entities

# Display named entities and custom additions
entities = extract_entities(text)
for ent in entities:
    print(f"Entity: {ent[0]}, Label: {ent[1]}")

Entity: breast cancer, Label: DISEASE
Entity: Tamoxifen, Label: CHEMICAL
Entity: 18 years old, Label: AGE
Entity: female, Label: GENDER


In [9]:
######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - Clinical Trial Dataframe ########################################

### Apply to a dataframe of trial data
### Create Inclusion Criteria Categories to be used later in column matching/finding

# Load the MedSpaCy model
nlp = spacy.load('en_ner_bc5cdr_md')

# Function to extract entities and labels
def extract_entities(text):
    # Process the text through the NLP model
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Custom check for age-related information (e.g., "18 years old")
    age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
    age_matches = re.findall(age_pattern, text, re.IGNORECASE)
    
    # If age-related information is found, add it to the entities with the correct label
    for age in age_matches:
        entities.append((f"{age[0]} years old", 'AGE'))
    
    # Custom check for gender-related information (e.g., "Male", "Female")
    gender_keywords = ['female', 'male']  # We only need to check for 'female' and 'male'
    
    # Check for the first gender-related term match (female first, then male)
    gender_found = False
    for gender in gender_keywords:
        match = re.search(r'\b' + gender + r'\b', text, re.IGNORECASE)
        if match:
            entities.append((match.group(), 'GENDER'))
            break  # Once a match is found, stop further checking

    # Extract the unique labels to avoid duplicates and return them
    unique_labels = set([label for _, label in entities])
    return list(unique_labels)

# Apply the function to the inclusion_criteria column and create a new 'Category' column
clinical_trials_incl['Category'] = clinical_trials_incl['Inclusion_Criteria'].apply(lambda x: ', '.join(extract_entities(x)))

# Display the updated DataFrame
print(clinical_trials_incl)

   Trial_Name  Trial_ID          Inclusion_Criteria Category
0  Test_Trial       123            Aged 18 or over.      AGE
1  Test_Trial       123                      Female   GENDER
2  Test_Trial       123  Diagnosed with Lung Cancer  DISEASE


In [10]:
######################################## Use Fuzzy: Find columns in Patient Data that match Trial Inclusion Criteria ########################################

# Function to find exact matches and fuzzy matches
def find_matching_columns(category, dict_of_dfs, fuzzy_threshold=80):
    # Step 1: Find exact matches (case-insensitive)
    exact_matches = []
    for df_name, df in dict_of_dfs.items():
        if category.lower() in [col.lower() for col in df.columns]:
            exact_column = next(col for col in df.columns if col.lower() == category.lower())
            exact_matches.append(f'{df_name}.{exact_column}')
            return exact_matches  # Return immediately after finding an exact match
    
    # Step 2: If no exact match, find fuzzy matches
    fuzzy_matches = []
    for df_name, df in dict_of_dfs.items():
        columns = df.columns
        for column in columns:
            score = process.extractOne(category, [column])  # Compare category with each column
            if score and score[1] >= fuzzy_threshold:  # If score is above threshold
                fuzzy_matches.append(f'{df_name}.{column}')
    
    return fuzzy_matches

# Loop through the clinical_trials_incl DataFrame and apply matching function
def add_source_columns(clinical_trials_incl, table_dataframes):
    source_columns_list = []
    
    for index, row in clinical_trials_incl.iterrows():
        category = row['Category']
        matching_columns = find_matching_columns(category, table_dataframes)
        
        # If there are multiple matches, list them, else return 'No match'
        if matching_columns:
            source_columns_list.append(', '.join(matching_columns))
        else:
            source_columns_list.append('No match')
    
    clinical_trials_incl['Source_Columns'] = source_columns_list
    return clinical_trials_incl

# Apply the function to the clinical_trials_incl DataFrame
updated_df = add_source_columns(clinical_trials_incl, table_dataframes)

# Display the updated DataFrame
print(updated_df)

   Trial_Name  Trial_ID          Inclusion_Criteria Category  \
0  Test_Trial       123            Aged 18 or over.      AGE   
1  Test_Trial       123                      Female   GENDER   
2  Test_Trial       123  Diagnosed with Lung Cancer  DISEASE   

        Source_Columns  
0     DEMOGRAHPICS.age  
1  DEMOGRAHPICS.gender  
2         LABS.disease  


In [181]:
######################################## Use Fuzzy: Calculate Patient Match Percentage by Inclusion Criteria ########################################

def match_patients_to_criteria(updated_df, table_dataframes):
    results = []

    for _, row in updated_df.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']
        source_column = row['Source_Columns']
        
        table_name, column_name = source_column.split('.')
        
        if table_name in table_dataframes:
            df = table_dataframes[table_name]
            
            if column_name in df.columns:
                for _, patient_row in df.iterrows():
                    patient_id = patient_row['patientid']
                    source_value = patient_row[column_name]
                    
                    match_percentage = calculate_match_percentage(inclusion_criteria, source_value, category)
                    
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage
                    })

    return pd.DataFrame(results)

def calculate_match_percentage(criteria, value, category):
    if criteria is None or value is None:
        return 0

    if category == 'AGE':
        try:
            age_limit = int(criteria.split()[1])
            patient_age = int(value)
            return 100 if patient_age >= age_limit else 0
        except (ValueError, IndexError):
            return 0
    elif category == 'GENDER':
        return 100 if str(criteria).lower() == str(value).lower() else 0
    elif category == 'DISEASE':
        return fuzz.token_set_ratio(str(criteria), str(value))
    else:
        return fuzz.token_set_ratio(str(criteria), str(value))

result_df = match_patients_to_criteria(updated_df, table_dataframes)
result_df['Expert_Decision_Incl'] = result_df['Patient_ID'].isin(eligible_patient_ids).astype(int)
print(result_df)

                                  Patient_ID  Trial_Name  Trial_ID  \
0       D6288764-EBAB-429D-9D6E-BA5152340FDD  Test_Trial       123   
1       548CD51B-9AC0-4A25-9D48-B6D08675DD00  Test_Trial       123   
2       A552A6C5-63DB-4152-B0E3-4C1A94C9CF27  Test_Trial       123   
3       ECCE7682-F192-4E4B-8915-D3808F81E60E  Test_Trial       123   
4       B995B177-DA99-4BEF-9818-8E40DCD9841D  Test_Trial       123   
...                                      ...         ...       ...   
264290  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
264291  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
264292  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
264293  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
264294  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   

                Inclusion_Criteria Category     Source_Column Source_Value  \
0                 Aged 18 or over.      AGE  DEMOGRAHPICS.age           79   
1  

In [182]:
######################################## Downloand Patient Level Match Percentage by Inclusion Criteria ########################################

# Save the DataFrame to a CSV file
result_df.to_csv('result_df_1.csv', index=False) 

In [183]:
######################################## Inspect Patient Level Match Percentage by Inclusion Criteria ########################################

# pd.set_option('display.max_columns', None)  # or 1000
# pd.set_option('display.max_rows', 100)  # or 1000
# pd.set_option('display.max_colwidth', None)  # or 199

# # Filter rows where a specific column contains a value
# #filtered_df = result_df[result_df['Category'] == 'GENDER']
# filtered_df = result_df[result_df['Source_Value'].str.contains('lung', case=False, na=False)]

# # Print the first 10 rows of the filtered DataFrame
# print(filtered_df.head(10))

                                Patient_ID  Trial_Name  Trial_ID  \
2095  06299E62-F5B6-4574-B0EE-F944074D6C2B  Test_Trial       123   
2096  06299E62-F5B6-4574-B0EE-F944074D6C2B  Test_Trial       123   
2120  294CAE76-8964-46C5-8FED-9971C96E25D4  Test_Trial       123   
2152  294CAE76-8964-46C5-8FED-9971C96E25D4  Test_Trial       123   
2171  294CAE76-8964-46C5-8FED-9971C96E25D4  Test_Trial       123   
2207  294CAE76-8964-46C5-8FED-9971C96E25D4  Test_Trial       123   

              Inclusion_Criteria Category Source_Column  \
2095  Diagnosed with Lung Cancer  DISEASE  LABS.disease   
2096  Diagnosed with Lung Cancer  DISEASE  LABS.disease   
2120  Diagnosed with Lung Cancer  DISEASE  LABS.disease   
2152  Diagnosed with Lung Cancer  DISEASE  LABS.disease   
2171  Diagnosed with Lung Cancer  DISEASE  LABS.disease   
2207  Diagnosed with Lung Cancer  DISEASE  LABS.disease   

                            Source_Value  Match_Percentage  \
2095  Thorax: Non-Small Cell Lung Cancer       

In [188]:
######################################## Evaluate Patient Level Match Percentage by Inclusion Criteria ########################################

eval_result_all = result_df.copy()
eval_result_all['Model_Decision'] = eval_result_all['Match_Percentage'].apply(lambda x: 1 if x > 50 else 0)

# Calculate overall confusion matrix and kappa score
y_true_overall = eval_result_all['Expert_Decision_Incl']
y_pred_overall = eval_result_all['Model_Decision']

conf_matrix_overall = confusion_matrix(y_true_overall, y_pred_overall)
kappa_score_overall = cohen_kappa_score(y_true_overall, y_pred_overall)

# Print overall results
# Print summary of eligible and ineligible patients
print(f"Patients Eligible: {len(eligible_patient_ids)}")
print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
print("\nOverall Results:")
print("\nConfusion Matrix:")
print(conf_matrix_overall)
print("Cohen's Kappa Score:", kappa_score_overall)
print("\n" + "="*50 + "\n")

# Calculate for each category
categories = eval_result_all['Category'].unique()

for category in categories:
    category_df = eval_result_all[eval_result_all['Category'] == category]
    
    y_true = category_df['Expert_Decision_Incl']
    y_pred = category_df['Model_Decision']
    
    conf_matrix = confusion_matrix(y_true, y_pred)
    kappa_score = cohen_kappa_score(y_true, y_pred)
    
    print(f"Results for Category: {category}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Cohen's Kappa Score:", kappa_score)
    print("\n" + "="*50 + "\n")                 #line breaks

Patients Eligible: 17
Patients Ineligible: 982

Overall Results:

Confusion Matrix:
[[244736   1743]
 [ 17776     40]]
Cohen's Kappa Score: -0.008284804401193613


Results for Category: AGE
Confusion Matrix:
[[  2 981]
 [  0  17]]
Cohen's Kappa Score: 6.931221892869655e-05


Results for Category: GENDER
Confusion Matrix:
[[419 564]
 [  0  17]]
Cohen's Kappa Score: 0.024636573361510705


Results for Category: DISEASE
Confusion Matrix:
[[244315    198]
 [ 17776      6]]
Cohen's Kappa Score: -0.0008720132019399696




In [None]:
##################################################################################################################################################

In [None]:
############################################################### Model 2 ##########################################################################

In [None]:
##################################################################################################################################################

In [207]:
######################################## Matching with a Transformer on Disease Category ########################################

updated_df_disease = updated_df[updated_df['Category'] == 'DISEASE']

# Load the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to calculate match percentage using cosine similarity
def calculate_match_percentage(criteria, value, category):
    if criteria is None or value is None:
        return 0
    
    # Convert both criteria and value to embeddings
    criteria_embedding = model.encode([str(criteria)])
    value_embedding = model.encode([str(value)])
    
    # Calculate cosine similarity between embeddings
    similarity_score = cosine_similarity(criteria_embedding, value_embedding)
    
    # Return match percentage (multiply by 100 for percentage scale)
    return similarity_score[0][0] * 100

# Function to match patients to trial criteria
def match_patients_to_criteria(updated_df, table_dataframes):
    results = []

    for _, row in updated_df.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']
        source_column = row['Source_Columns']
        
        table_name, column_name = source_column.split('.')
        
        if table_name in table_dataframes:
            df = table_dataframes[table_name]
            
            if column_name in df.columns:
                for _, patient_row in df.iterrows():
                    patient_id = patient_row['patientid']
                    source_value = patient_row[column_name]
                    
                    # Calculate match percentage using cosine similarity
                    match_percentage = calculate_match_percentage(inclusion_criteria, source_value, category)
                    
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage
                    })

    return pd.DataFrame(results)

# Example usage
result_df_disease = match_patients_to_criteria(updated_df_disease, table_dataframes)
result_df_disease['Expert_Decision_Incl'] = result_df_disease['Patient_ID'].isin(eligible_patient_ids).astype(int)

# Print the result dataframe
print(result_df_disease)

                                  Patient_ID  Trial_Name  Trial_ID  \
0       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
1       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
2       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
3       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
4       06040AEE-66C8-4EE6-AC18-05731251CC56  Test_Trial       123   
...                                      ...         ...       ...   
262290  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262291  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262292  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262293  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262294  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   

                Inclusion_Criteria Category Source_Column  \
0       Diagnosed with Lung Cancer  DISEASE  LABS.disease   
1       Diagnosed with Lung Cancer  D

In [208]:
######################################## Evaluate Patient Level Match Percentage by Inclusion Criteria ########################################

eval_result_df_disease = result_df_disease.copy()
eval_result_df_disease['Model_Decision'] = eval_result_df_disease['Match_Percentage'].apply(lambda x: 1 if x > 90 else 0)

# Calculate overall confusion matrix and kappa score
y_true_disease = eval_result_df_disease['Expert_Decision_Incl']
y_pred_disease = eval_result_df_disease['Model_Decision']

conf_matrix_disease = confusion_matrix(y_true_disease, y_pred_disease)
kappa_score_disease = cohen_kappa_score(y_true_disease, y_pred_disease)

# Print overall results
# Print summary of eligible and ineligible patients
print(f"Results for Category: DISEASE")
print(f"\nPatients Eligible: {len(eligible_patient_ids)}")
print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
print("\nOverall Results:")
print("\nConfusion Matrix:")
print(conf_matrix_disease)
print("Cohen's Kappa Score:", kappa_score_disease)
print("\n" + "="*50 + "\n")

Results for Category: DISEASE

Patients Eligible: 17
Patients Ineligible: 982

Overall Results:

Confusion Matrix:
[[244513      0]
 [ 17782      0]]
Cohen's Kappa Score: 0.0




In [None]:
##################################################################################################################################################

In [None]:
############################################################### Model 3 ##########################################################################

In [None]:
##################################################################################################################################################

In [202]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the SentenceTransformer model and NER pipeline
transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the biomedical-ner-all model
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
biomedical_ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to extract biomedical entities using NER pipeline
def extract_entities(text):
    if not text:
        return []
    
    # Use the NER pipeline to extract entities
    ner_results = biomedical_ner_pipeline(text)
    
    # Return a list of recognized entities
    return [entity['word'] for entity in ner_results]

# Function to calculate match percentage using cosine similarity
def calculate_match_percentage(criteria, value, category):
    if not criteria or not value:
        return 0
    
    # Extract entities from both criteria and value
    criteria_entities = extract_entities(criteria)
    value_entities = extract_entities(value)

    # If entities are extracted, compare them; else fallback to raw text
    if criteria_entities and value_entities:
        # Convert entities to embeddings
        criteria_embedding = transformer_model.encode([" ".join(criteria_entities)])
        value_embedding = transformer_model.encode([" ".join(value_entities)])
    else:
        # Fallback to raw text embeddings
        criteria_embedding = transformer_model.encode([str(criteria)])
        value_embedding = transformer_model.encode([str(value)])
    
    # Calculate cosine similarity
    similarity_score = cosine_similarity(criteria_embedding, value_embedding)
    return similarity_score[0][0] * 100

# Function to match patients to trial criteria
def match_patients_to_criteria(updated_df, table_dataframes):
    results = []

    for _, row in updated_df.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']
        source_column = row['Source_Columns']
        
        table_name, column_name = source_column.split('.')
        
        if table_name in table_dataframes:
            df = table_dataframes[table_name]
            
            if column_name in df.columns:
                for _, patient_row in df.iterrows():
                    patient_id = patient_row['patientid']
                    source_value = patient_row[column_name]
                    
                    # Calculate match percentage using cosine similarity
                    match_percentage = calculate_match_percentage(inclusion_criteria, source_value, category)
                    
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage
                    })

    return pd.DataFrame(results)

# Filter for disease category
updated_df_disease = updated_df[updated_df['Category'] == 'DISEASE']

# Match patients to trials
result_df_disease = match_patients_to_criteria(updated_df_disease, table_dataframes)

# Add expert decision column
result_df_disease['Expert_Decision_Incl'] = result_df_disease['Patient_ID'].isin(eligible_patient_ids).astype(int)

# Print the result dataframe
print(result_df_disease)

                                  Patient_ID  Trial_Name  Trial_ID  \
0       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
1       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
2       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
3       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
4       06040AEE-66C8-4EE6-AC18-05731251CC56  Test_Trial       123   
...                                      ...         ...       ...   
262290  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262291  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262292  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262293  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262294  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   

                Inclusion_Criteria Category Source_Column  \
0       Diagnosed with Lung Cancer  DISEASE  LABS.disease   
1       Diagnosed with Lung Cancer  D

In [206]:
######################################## Evaluate Model 3 ########################################

eval_result_df_disease = result_df_disease.copy()
eval_result_df_disease['Model_Decision'] = eval_result_df_disease['Match_Percentage'].apply(lambda x: 1 if x > 90 else 0)

# Calculate overall confusion matrix and kappa score
y_true_disease = eval_result_df_disease['Expert_Decision_Incl']
y_pred_disease = eval_result_df_disease['Model_Decision']

conf_matrix_disease = confusion_matrix(y_true_disease, y_pred_disease)
kappa_score_disease = cohen_kappa_score(y_true_disease, y_pred_disease)

# Print overall results
# Print summary of eligible and ineligible patients
print(f"Results for Category: DISEASE")
print(f"\nPatients Eligible: {len(eligible_patient_ids)}")
print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
print("\nOverall Results:")
print("\nConfusion Matrix:")
print(conf_matrix_disease)
print("Cohen's Kappa Score:", kappa_score_disease)
print("\n" + "="*50 + "\n")

Results for Category: DISEASE

Patients Eligible: 17
Patients Ineligible: 982

Overall Results:

Confusion Matrix:
[[244513      0]
 [ 17782      0]]
Cohen's Kappa Score: 0.0




In [None]:
##################################################################################################################################################

In [None]:
############################################################### Model 4 ##########################################################################

In [None]:
##################################################################################################################################################

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the SentenceTransformer model and biomedical NER pipeline
transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the biomedical-ner-all model
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
biomedical_ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to extract biomedical entities using the NER pipeline
def extract_entities(text):
    if not text:
        return []
    ner_results = biomedical_ner_pipeline(text)
    return [entity['word'] for entity in ner_results]

# Function to calculate match percentage using NER and cosine similarity
def calculate_match_percentage(criteria, value):
    if not criteria or not value:
        return 0

    # Extract entities from criteria and value
    criteria_entities = extract_entities(criteria)
    value_entities = extract_entities(value)

    # Use extracted entities if available, otherwise fallback to raw text
    if criteria_entities and value_entities:
        criteria_text = " ".join(criteria_entities)
        value_text = " ".join(value_entities)
    else:
        criteria_text = str(criteria)
        value_text = str(value)
    
    # Generate embeddings
    criteria_embedding = transformer_model.encode([criteria_text])
    value_embedding = transformer_model.encode([value_text])
    
    # Calculate cosine similarity
    similarity_score = cosine_similarity(criteria_embedding, value_embedding)
    return similarity_score[0][0] * 100

# Function to match patients to trial criteria
def match_patients_to_criteria(updated_df, table_dataframes):
    results = []

    for _, row in updated_df.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']
        source_column = row['Source_Columns']
        
        table_name, column_name = source_column.split('.')
        
        if table_name in table_dataframes:
            df = table_dataframes[table_name]
            
            if column_name in df.columns:
                for _, patient_row in df.iterrows():
                    patient_id = patient_row['patientid']
                    source_value = patient_row[column_name]
                    
                    # Calculate match percentage
                    match_percentage = calculate_match_percentage(inclusion_criteria, source_value)
                    
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage
                    })

    return pd.DataFrame(results)

# Filter for disease category
updated_df_disease = updated_df[updated_df['Category'] == 'DISEASE']

# Match patients to trials
result_df_disease = match_patients_to_criteria(updated_df_disease, table_dataframes)

# Add expert decision column
result_df_disease['Expert_Decision_Incl'] = result_df_disease['Patient_ID'].isin(eligible_patient_ids).astype(int)

# Print the result dataframe
print(result_df_disease)

                                  Patient_ID  Trial_Name  Trial_ID  \
0       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
1       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
2       8F788B33-EBC7-438D-B8E7-2F6AC7F8F434  Test_Trial       123   
3       BDBE84EA-66D0-4E4F-9E72-ADD982731073  Test_Trial       123   
4       06040AEE-66C8-4EE6-AC18-05731251CC56  Test_Trial       123   
...                                      ...         ...       ...   
262290  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262291  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262292  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262293  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   
262294  A2391F9A-1D24-4FAD-B624-93BD6165ED92  Test_Trial       123   

                Inclusion_Criteria Category Source_Column  \
0       Diagnosed with Lung Cancer  DISEASE  LABS.disease   
1       Diagnosed with Lung Cancer  D

In [12]:
############################################################ Evaluate Model 4 #####################################################################

eval_result_df_disease = result_df_disease.copy()
eval_result_df_disease['Model_Decision'] = eval_result_df_disease['Match_Percentage'].apply(lambda x: 1 if x > 60 else 0)

# Calculate overall confusion matrix and kappa score
y_true_disease = eval_result_df_disease['Expert_Decision_Incl']
y_pred_disease = eval_result_df_disease['Model_Decision']

conf_matrix_disease = confusion_matrix(y_true_disease, y_pred_disease)
kappa_score_disease = cohen_kappa_score(y_true_disease, y_pred_disease)

# Print overall results
# Print summary of eligible and ineligible patients
print(f"Results for Category: DISEASE")
print(f"\nPatients Eligible: {len(eligible_patient_ids)}")
print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
print("\nOverall Results:")
print("\nConfusion Matrix:")
print(conf_matrix_disease)
print("Cohen's Kappa Score:", kappa_score_disease)
print("\n" + "="*50 + "\n")

Results for Category: DISEASE

Patients Eligible: 17
Patients Ineligible: 982

Overall Results:

Confusion Matrix:
[[244513      0]
 [ 17776      6]]
Cohen's Kappa Score: 0.0006289062782696142




In [14]:
result_df_disease.to_csv('result_df_disease_model4_2.csv', index=False) 