In [1]:
######################################## Check Environment ########################################
import sys

# Report which interpreter (path and version) this kernel is running on.
for label, value in (
    ("Python executable:", sys.executable),
    ("Python version:", sys.version),
):
    print(label, value)

Python executable: /opt/homebrew/anaconda3/envs/venv_mitsui_condapy310/bin/python
Python version: 3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:24:20) [Clang 17.0.6 ]


In [2]:
######################################## Import packages ########################################

import pandas as pd
import numpy as np
# Create chunks
import re

# Model for NER
import spacy 
from sklearn.cluster import KMeans
import medspacy
from medspacy.ner import TargetRule
from thefuzz import fuzz, process

#UMLSClient for NER
import umls_api
from umls_api_client import UMLS
from quickumls import QuickUMLS

# Use natural language processing (NLP) to extract keywords from the criteria
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer, util

# Performance
import sklearn
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import snowflake.connector

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danageorge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
######################################## Connect to Snowflake ########################################

import os

# Establish a connection.
# The user, account, warehouse and role can be overridden with environment
# variables so the notebook is not tied to one person's identity or to the
# ACCOUNTADMIN role; the defaults reproduce the values used originally.
# Database and schema stay fixed because later queries spell them out in SQL.
conn = snowflake.connector.connect(
    user=os.environ.get('SNOWFLAKE_USER', 'dana_george@hakkoda.io'),
    authenticator='externalbrowser',
    account=os.environ.get('SNOWFLAKE_ACCOUNT', 'ska04930.east-us-2.azure'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'DATASCIENCE_WH'),
    database='ONCOEMR_RAW_DEV',
    schema='DBO',
    role=os.environ.get('SNOWFLAKE_ROLE', 'ACCOUNTADMIN'),
)

# Run a test query to confirm the session works
cursor = conn.cursor()
cursor.execute("SELECT CURRENT_VERSION()")
row = cursor.fetchone()
print("Snowflake version:", row[0])

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/bbaf0319-e615-416f-8870-f7eacf074b66/saml2?SAMLRequest=nZJPc9owEMW%2Fikc925b%2FAEEDZGgorWdoYTD0kJuw16DBlhytHId%2B%2BsoGZtJDcuhNI73d39O%2BnTy%2BVaXzChqFklMSeJQ4IDOVC3mckv1u6T4QBw2XOS%2BVhCm5AJLH2QR5VdZs3piT3MJLA2gc20gi6x6mpNGSKY4CmeQVIDMZS%2Bc%2FVyz0KOOIoI3FkVtJjsKyTsbUzPfbtvXayFP66IeUUp%2BOfavqJF%2FIO0T9OaPWyqhMlfeSN%2FunDxCBT%2BMOYRWWsLkVfhXyOoLPKIerCNmP3W7jbtbpjjjz%2B%2B%2BelMSmAp2CfhUZ7LerqwG0DvDMaTyOqAccjdugG3r8T6PBQ6naouRnyFRVN8Y29%2BzJLyD3S3UUdmTJYkrqs8jXx28pXQUqVXFTrPWyTfZyxZ%2BhSi7b7fdB9DIPaZbGp5RnGXF%2B3wMOu4ATxAYS2cVq7BUNYzcI3DDYUcqiiA2GXhSFz8RZ2FiF5KavvHvvfXiVyLRCVRglSyGhd3k48IJGwdiFYTBw42BYuA8PI%2BoWI%2BBZQUfxYTj0u%2FBCcl0g1hvRs%2F8cy8R%2F3%2BS2kr9sSslio0qRXZyl0hU3H4cYeEF%2FI3K36KUMKi7KeZ5rQLRhlq

In [4]:
######################################## Load Data ########################################

# Get sample patient ids.
# NOTE: the query has no LIMIT, so it returns every distinct patientid in the
# table (in random order) rather than a subset.
cursor = conn.cursor()
cursor.execute("""
    SELECT DISTINCT "patientid" FROM ONCOEMR_RAW_DEV.DBO.DEMOGRAHPICS
    ORDER BY RANDOM()
""")
sample_patient_ids = [row[0] for row in cursor.fetchall()]
# Render each id as a SQL literal: strings are single-quoted (embedded quotes
# doubled so they cannot terminate the literal), anything else is stringified.
sample_patient_ids = [
    "'" + patient_id.replace("'", "''") + "'" if isinstance(patient_id, str) else str(patient_id)
    for patient_id in sample_patient_ids
]
# The IN (...) list is the same for every table, so build it once.
patient_id_sql_list = ', '.join(sample_patient_ids)

cursor = conn.cursor()
cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'DBO'
    AND table_catalog = 'ONCOEMR_RAW_DEV'
    AND table_type = 'BASE TABLE';
""")

# Fetch all the table names
tables = [row[0] for row in cursor.fetchall()]
#print(tables)

# Create a dictionary to hold each table as a DataFrame
table_dataframes = {}
table_dataframes_spat = {}


def _load_patient_rows(cursor, table, id_column, id_sql_list):
    """Return the rows of ``table`` whose ``id_column`` is in ``id_sql_list`` as a DataFrame.

    ``id_sql_list`` is an already-rendered, comma-separated list of SQL literals.
    Column names of the DataFrame are taken from the cursor description.
    """
    cursor.execute(f"""
        SELECT *
        FROM ONCOEMR_RAW_DEV.DBO.{table}
        WHERE "{id_column}" IN ({id_sql_list})
    """)
    rows = cursor.fetchall()
    result_columns = [desc[0] for desc in cursor.description]
    return pd.DataFrame(rows, columns=result_columns)


for table in tables:
    # Look up the table's column names so we know which id column (if any) it has
    cursor.execute(f"""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = 'DBO' AND table_name = '{table}'
    """)

    # Kept in its own variable: both checks below must test the schema's column
    # list, not the column list of a result set fetched in between.
    table_columns = [row[0] for row in cursor.fetchall()]

    # If 'patientid' is a column, load the matching rows
    if 'patientid' in table_columns:
        table_dataframes[table] = _load_patient_rows(
            cursor, table, 'patientid', patient_id_sql_list
        )

    # If 'spatientid' is a column, load the matching rows
    if 'spatientid' in table_columns:
        table_dataframes_spat[table] = _load_patient_rows(
            cursor, table, 'spatientid', patient_id_sql_list
        )

# Merge table_dataframes_spat into table_dataframes; for a table present in
# both dictionaries the 'spatientid' result replaces the 'patientid' one.
table_dataframes.update(table_dataframes_spat)

# Now table_dataframes contains all the tables from both dictionaries
print("Data Loaded Successfully!")
print(" ")
print("Tables Loaded:")
for table in table_dataframes:
    print(f"{table}")
    #print(df.head())

# Expose each table as a module-level variable named after the table
# (later cells refer to e.g. DEMOGRAPHICS and DIAGNOSIS directly).
for table, df in table_dataframes.items():
    globals()[table] = df

# Now you can access the DataFrames as individual variables:
# print(ADMINISTRATIONS.head())

# Read the trial exclusion / inclusion criteria from their CSV files
exclusion_csv, inclusion_csv = 'clinical_trials_data_simple_exclusion.csv', 'bonecancer_incl.csv'
clinical_trials_excl = pd.read_csv(exclusion_csv)
clinical_trials_incl = pd.read_csv(inclusion_csv)

# Append the two trial datasets to the "Tables Loaded" listing, then a blank-ish line
print(
    "clinical_trials_data_simple_exclusion",
    "clinical_trials_data_simple_inclusion",
    " ",
    sep="\n",
)

def print_columns_of_dict_of_dfs(df_dict):
    """Print, for every entry of ``df_dict``, a header line with the key,
    the DataFrame's ``columns`` index, and a 20-dash separator line.

    Args:
        df_dict: mapping of name -> DataFrame.
    """
    separator = "-" * 20
    for name in df_dict:
        frame = df_dict[name]
        print(f"Columns of {name}:")
        print(frame.columns)
        print(separator)

# Print the column index of every loaded table (one section per table)
print_columns_of_dict_of_dfs(table_dataframes)

Data Loaded Successfully!
 
Tables Loaded:
ADMINISTRATIONS
ADVANCEDIRECTIVES
ALLERGY
CHARGE
DEMOGRAHPICS
DEMOGRAPHICS
DIAGNOSIS
DISEASESTATUS
ERX
FAMILYHISTORY
HOSPITALIZATION
INSURANCE
LABS
ORDERS
RADIOLOGY
REFERRINGPROVIDER
SOCIALHISTORY
TRANSFUSION
GRADESCALES
SURGICALHISTORY
PERFORMANCE
VISIT
BIOMARKERS
TOXICITIES
MEDICATIONLIST
STAGING
DATA_HISTORY
PATIENT_LOCATION_HISTORY
ORDER_CHARGE_HISTORY
TREATMENT_CURRENT_HISTORY
VITAL_SIGN_HISTORY
TREATMENT_PREVIOUS_HISTORY
clinical_trials_data_simple_exclusion
clinical_trials_data_simple_inclusion
 
Columns of ADMINISTRATIONS:
Index(['clientid', 'administrationid', 'diagnosisid', 'doseadministered',
       'doseapproved', 'drugname', 'duration', 'intent', 'endreason', 'form',
       'targetdrugname', 'targetdrugshortname', 'targetdrugcategory', 'ndc',
       'nodosestaken', 'orderedamount', 'ordereddate', 'administeredunits',
       'targetadministeredunits', 'orderid', 'patientid', 'plannedcycles',
       'providerid', 'orderhassignoff', 

In [5]:
######################################## Feature Engineering ########################################

# Convert non-numeric values to NaN
DEMOGRAPHICS['age'] = pd.to_numeric(DEMOGRAPHICS['age'], errors='coerce')

# Replace missing / unparseable ages with the sentinel -1, then cast to int.
# NaN does not survive this step: any later age comparison must treat -1 as "unknown".
DEMOGRAPHICS['age'] = DEMOGRAPHICS['age'].fillna(-1).astype(int)  
print("Feature Engineering Complete!")

Feature Engineering Complete!


In [6]:
######################################## Quality Check ########################################
# Show the columns of the inclusion-criteria frame and the full DEMOGRAPHICS frame.
# NOTE(review): the printed DEMOGRAPHICS output lists columns such as patientssn and dob;
# consider limiting or clearing this output before the notebook is shared.
print(clinical_trials_incl.columns)
print(DEMOGRAPHICS)

Index(['Trial_Name', 'Trial_ID', 'Inclusion_Criteria'], dtype='object')
      RowID clientid                             patientid patientmrn  \
0    112304   CA0026  D6288764-EBAB-429D-9D6E-BA5152340FDD       None   
1    158902   CA0026  548CD51B-9AC0-4A25-9D48-B6D08675DD00       None   
2       924   CA0026  A552A6C5-63DB-4152-B0E3-4C1A94C9CF27       None   
3    109938   CA0026  ECCE7682-F192-4E4B-8915-D3808F81E60E       None   
4    131886   CA0026  B995B177-DA99-4BEF-9818-8E40DCD9841D       None   
..      ...      ...                                   ...        ...   
995     148   CA0026  9D1DAE72-E47F-453C-AF06-12AA2B97855C       None   
996     228   CA0026  E393B5A7-862D-43DD-97A2-4B2075FBD297       None   
997     398   CA0026  F90AA3C5-D2D7-4A6E-AFC5-024A1F912112       None   
998     878   CA0026  BE19FCC3-3107-400D-9C0B-1A6C3ADB8D5E       None   
999     275   CA0026  F124E3E2-C129-4FA2-8144-BF11CEC44CFB       None   

    patientssn         primaryphysicianid   dob  bi

In [7]:
######################################## Build Mock Expert Decision ########################################
# Perform the LEFT JOIN: one row per (patient, diagnosis); patients without a
# diagnosis keep a single row with empty diagnosis columns.
merged_df = DEMOGRAPHICS.merge(DIAGNOSIS, on='patientid', how='left')

# Filter using "LIKE" equivalent: the diagnosis group must mention both
# 'bone' and 'cancer' (case-insensitive; missing values never match).
diagnosis_group = merged_df['targetdetaileddiagnosisgroup']
eligible = merged_df[
    # (merged_df['age'] >= 18) &
    # (merged_df['gender'] == 'Female') &
    diagnosis_group.str.contains('bone', case=False, na=False)
    & diagnosis_group.str.contains('cancer', case=False, na=False)
].copy()  # explicit copy: the column added below must not write into a slice of merged_df

# # For evaluation metrics later
# eligible['Expert_Decision_Age'] = 1
# eligible['Expert_Decision_Gender'] = 1
eligible['Expert_Decision_Disease'] = 1

#distinct_count = eligible['patientid'].nunique()

# Extract patient IDs that match the expert's eligibility criteria
eligible_patient_ids = eligible['patientid'].unique().tolist()

# Get patient IDs that are not in the eligible list
ineligible_patient_ids = merged_df[~merged_df['patientid'].isin(eligible_patient_ids)]['patientid'].unique().tolist()

print(len(eligible_patient_ids))
print(len(ineligible_patient_ids))

print(eligible)

6
993
      RowID clientid_x                             patientid patientmrn  \
918     289     CA0026  1C18C7F0-A512-40EF-AD48-B934521F9C7B       None   
1246    436     CA0026  0CA1BC14-02F2-47BC-B8B1-7EF62566D7D4       None   
1247    436     CA0026  0CA1BC14-02F2-47BC-B8B1-7EF62566D7D4       None   
2459    827     CA0026  01DE42C6-D20D-4BF8-A6DE-AA65AE7512BC       None   
2478    274     CA0026  771F10EB-EB2F-4D0D-BC16-340C18C975EF       None   
3180    494     CA0026  11C1721B-F992-4E34-8AC1-50405BA0E6A5       None   
3255    195     CA0026  5A1271D8-B7FD-4999-BCFF-35D3DD924D6C       None   
3256    195     CA0026  5A1271D8-B7FD-4999-BCFF-35D3DD924D6C       None   
3272    195     CA0026  5A1271D8-B7FD-4999-BCFF-35D3DD924D6C       None   
3273    195     CA0026  5A1271D8-B7FD-4999-BCFF-35D3DD924D6C       None   

     patientssn         primaryphysicianid   dob  birthyear  age   dod  ...  \
918        None         UID_Tz419171991_39  None        NaN   92  None  ...   
1246      

In [28]:
# Write the expert-eligible rows to CSV (without the index).
# NOTE(review): `eligible` is built by filtering on 'bone' and 'cancer', yet the file
# name says "breast" — confirm whether the name is stale before relying on this file.
eligible.to_csv('test_eligible_breast.csv', index=False)

In [10]:
# ######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - 1 line of text, model testing ########################################

# ### Test to apply to 1 line of text
# ### Create Inclusion Criteria Categories to be used later in column matching/finding

# # Load the MedSpaCy model
# nlp = spacy.load('en_ner_bc5cdr_md')

# # Process your text
# text = "The patient is a female 18 years old and was diagnosed with breast cancer and prescribed Tamoxifen."

# # Function to extract entities and labels
# def extract_entities(text):
#     doc = nlp(text)
#     entities = [(ent.text, ent.label_) for ent in doc.ents]

#     # Custom check for age-related information (e.g., "18 years old")
#     age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
#     age_matches = re.findall(age_pattern, text, re.IGNORECASE)
    
#     # If age-related information is found, add it to the entities with the correct label
#     for age in age_matches:
#         entities.append((f"{age} years old", 'AGE'))
    
#     # Custom check for gender-related information (e.g., "Male", "Female")
#     gender_keywords = ['female', 'male']  # We only need to check for 'female' and 'male'
    
#     # Check for the first gender-related term match (female first, then male)
#     gender_found = False
#     for gender in gender_keywords:
#         match = re.search(r'\b' + gender + r'\b', text, re.IGNORECASE)
#         if match:
#             entities.append((match.group(), 'GENDER'))
#             break  # Once a match is found, stop further checking

#     return entities

# # Display named entities and custom additions
# entities = extract_entities(text)
# for ent in entities:
#     print(f"Entity: {ent[0]}, Label: {ent[1]}")

Entity: breast cancer, Label: DISEASE
Entity: Tamoxifen, Label: CHEMICAL
Entity: 18 years old, Label: AGE
Entity: female, Label: GENDER


In [8]:
######################################## Use Medspacy: Create Entity/Label Pairs in Inclusion Criteria - Clinical Trial Dataframe ########################################

### Apply to a dataframe of trial data
### Create Inclusion Criteria Categories to be used later in column matching/finding

# Load the 'en_ner_bc5cdr_md' pipeline through spaCy; its entity labels are what
# extract_entities below reports as categories.
# NOTE(review): the section header mentions MedSpaCy, but this line only calls spacy.load — confirm which library is intended.
nlp = spacy.load('en_ner_bc5cdr_md')

# Function to extract entities and labels
def extract_entities(text):
    """Return the distinct category labels detected in ``text``, sorted alphabetically.

    Labels come from three sources:
      * the entity labels produced by the ``nlp`` pipeline,
      * 'AGE' when the age regular expression matches,
      * 'GENDER' when the whole word 'female' or 'male' occurs (case-insensitive).

    The result is sorted so the same text always yields the same label order;
    callers join the labels into a single string and compare it later, so the
    order must not depend on set iteration order.

    NOTE(review): in the age pattern both the leading keyword and the trailing
    unit are optional, so any standalone number in ``text`` yields 'AGE'.
    NOTE(review): a later cell in this file defines another ``extract_entities``
    with a different return value; once that cell has run, it replaces this one.

    Args:
        text: criterion text to analyse.

    Returns:
        list[str]: unique labels in sorted order (empty when nothing is found).
    """
    # Labels reported by the NER pipeline
    doc = nlp(text)
    labels = {ent.label_ for ent in doc.ents}

    # Custom check for age-related information (e.g., "18 years old")
    age_pattern = r'\b(?:aged|over|under|above|below)?\s*(\d+)\s*(?:years? old|yrs?|yo)?\b'
    if re.search(age_pattern, text, re.IGNORECASE):
        labels.add('AGE')

    # Custom check for gender-related information; 'female' is tested first,
    # and one hit is enough because only the label is reported.
    for gender in ('female', 'male'):
        if re.search(r'\b' + gender + r'\b', text, re.IGNORECASE):
            labels.add('GENDER')
            break

    return sorted(labels)

# Label every inclusion criterion: the labels returned by extract_entities are
# joined with ', ' and stored in a new 'Category' column.
clinical_trials_incl['Category'] = [
    ', '.join(extract_entities(criterion))
    for criterion in clinical_trials_incl['Inclusion_Criteria']
]

# Show the frame with its new column
print(clinical_trials_incl)

   Trial_Name  Trial_ID          Inclusion_Criteria Category
0  Test_Trial       123            Aged 18 or over.      AGE
1  Test_Trial       123                      Female   GENDER
2  Test_Trial       123  Diagnosed with Bone Cancer  DISEASE


In [9]:
######################################## Use Fuzzy: Find columns in Patient Data that match Trial Inclusion Criteria ########################################

# Function to find exact matches and fuzzy matches
def find_matching_columns(category, dict_of_dfs, fuzzy_threshold=80):
    """Find the 'TABLE.column' names that correspond to ``category``.

    Resolution order:
      1. The category 'disease' (any case) always maps to
         'DIAGNOSIS.targetdetaileddiagnosisgroup'.
      2. The first column, scanning tables in dictionary order, whose name
         equals the category case-insensitively; only that one is returned.
      3. Otherwise every column whose fuzzy score against the category is at
         least ``fuzzy_threshold``.

    Args:
        category: label to look up.
        dict_of_dfs: mapping of table name -> DataFrame.
        fuzzy_threshold: minimum score for a fuzzy match.

    Returns:
        list[str]: matching 'TABLE.column' names (possibly empty).
    """
    wanted = category.lower()
    if wanted == 'disease':
        return ['DIAGNOSIS.targetdetaileddiagnosisgroup']

    # Exact (case-insensitive) match: stop at the very first hit
    for df_name, df in dict_of_dfs.items():
        for col in df.columns:
            if col.lower() == wanted:
                return [f'{df_name}.{col}']

    # Fuzzy match: score the category against each column on its own
    return [
        f'{df_name}.{column}'
        for df_name, df in dict_of_dfs.items()
        for column in df.columns
        if (best := process.extractOne(category, [column])) and best[1] >= fuzzy_threshold
    ]

# Loop through the clinical_trials_incl DataFrame and apply matching function
def add_source_columns(clinical_trials_incl, table_dataframes):
    """Add a 'Source_Columns' column describing where each row's Category lives.

    For every row the 'Category' value is resolved with find_matching_columns;
    the matches are joined with ', ', or 'No match' is stored when there are none.
    The column is added to the DataFrame passed in, and that same object is returned.

    Args:
        clinical_trials_incl: DataFrame with a 'Category' column.
        table_dataframes: mapping of table name -> DataFrame.

    Returns:
        The input DataFrame, now carrying 'Source_Columns'.
    """
    def _describe(category):
        hits = find_matching_columns(category, table_dataframes)
        return ', '.join(hits) if hits else 'No match'

    clinical_trials_incl['Source_Columns'] = [
        _describe(category) for category in clinical_trials_incl['Category']
    ]
    return clinical_trials_incl

# Apply the function to the clinical_trials_incl DataFrame.
# add_source_columns adds the column to the frame it receives and returns that same
# object, so clinical_trials_incl_ner and clinical_trials_incl refer to one DataFrame.
clinical_trials_incl_ner = add_source_columns(clinical_trials_incl, table_dataframes)

# Display the updated DataFrame
print(clinical_trials_incl_ner)

######################### Now the clinical trial data is ready. #########################

   Trial_Name  Trial_ID          Inclusion_Criteria Category  \
0  Test_Trial       123            Aged 18 or over.      AGE   
1  Test_Trial       123                      Female   GENDER   
2  Test_Trial       123  Diagnosed with Bone Cancer  DISEASE   

                           Source_Columns  
0                        DEMOGRAHPICS.age  
1                     DEMOGRAHPICS.gender  
2  DIAGNOSIS.targetdetaileddiagnosisgroup  


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Sentence-embedding model used for the cosine-similarity score
transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Tokenizer and token-classification model for biomedical NER, wrapped in a
# pipeline that uses the "simple" aggregation strategy
NER_CHECKPOINT = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
biomedical_ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Function to extract biomedical entities using the NER pipeline
def extract_entities(text):
    """Return the 'word' of every entity the biomedical NER pipeline finds in ``text``.

    Falsy input (empty string, None) yields an empty list without calling the pipeline.
    This definition replaces the earlier ``extract_entities`` in this file, which
    returned entity labels rather than entity words.
    """
    if text:
        return [hit['word'] for hit in biomedical_ner_pipeline(text)]
    return []

# Function to calculate match percentage using NER and cosine similarity
def calculate_match_percentage(criteria, value):
    """Score how closely ``value`` matches ``criteria`` as cosine similarity times 100.

    Both texts are first reduced to their NER entity words; when either side
    yields no entities, the raw texts (via ``str``) are compared instead.

    Args:
        criteria: inclusion-criterion text.
        value: patient-side text to compare against it.

    Returns:
        0 when either argument is falsy, otherwise the cosine similarity of the
        two sentence embeddings multiplied by 100.
    """
    if not (criteria and value):
        return 0

    # Entity words for both sides (criteria first, then value)
    criteria_entities, value_entities = extract_entities(criteria), extract_entities(value)

    # Entities are only used when both sides produced some
    both_found = bool(criteria_entities) and bool(value_entities)
    criteria_text = " ".join(criteria_entities) if both_found else str(criteria)
    value_text = " ".join(value_entities) if both_found else str(value)

    # Embed each text as a one-element batch
    criteria_embedding = transformer_model.encode([criteria_text])
    value_embedding = transformer_model.encode([value_text])

    # The similarity matrix is 1x1; scale its single entry to a percentage
    return cosine_similarity(criteria_embedding, value_embedding)[0][0] * 100

# Function to match patients to trial criteria
def match_patients_to_criteria(clinical_trials_incl_ner_disease, table_dataframes):
    """Score every patient against every trial criterion.

    For each criterion row, each 'TABLE.column' listed in its 'Source_Columns'
    value is looked up in ``table_dataframes``. Every patient id found in any
    table with a 'patientid' column then gets:
      * one result per row that patient has in the source table, scored with
        calculate_match_percentage (0 when the source value is missing), or
      * a single result with Source_Value None and Match_Percentage 0 when the
        patient has no row in the source table.

    Sources that cannot be used are skipped instead of raising: 'No match',
    unknown tables or columns, and tables without a 'patientid' column.
    When 'Source_Columns' lists several sources separated by ', ', each one is
    processed and reported under its own 'Source_Column' value.

    Args:
        clinical_trials_incl_ner_disease: DataFrame with columns 'Trial_Name',
            'Trial_ID', 'Inclusion_Criteria', 'Category', 'Source_Columns'.
        table_dataframes: mapping of table name -> DataFrame.

    Returns:
        DataFrame with columns Patient_ID, Trial_Name, Trial_ID,
        Inclusion_Criteria, Category, Source_Column, Source_Value,
        Match_Percentage. The columns are present even when there are no rows.
    """
    result_columns = [
        'Patient_ID', 'Trial_Name', 'Trial_ID', 'Inclusion_Criteria',
        'Category', 'Source_Column', 'Source_Value', 'Match_Percentage',
    ]
    results = []

    # Collect all unique patient IDs across the tables that expose 'patientid'
    all_patient_ids = set()
    for df in table_dataframes.values():
        if 'patientid' in df.columns:
            all_patient_ids.update(df['patientid'].unique())

    for _, row in clinical_trials_incl_ner_disease.iterrows():
        trial_name = row['Trial_Name']
        trial_id = row['Trial_ID']
        inclusion_criteria = row['Inclusion_Criteria']
        category = row['Category']

        # 'Source_Columns' may hold one source, several joined with ', ', or 'No match'
        for source_column in str(row['Source_Columns']).split(', '):
            table_name, dot, column_name = source_column.partition('.')
            if not dot:
                continue  # not of the form TABLE.column (e.g. 'No match')

            df = table_dataframes.get(table_name)
            if df is None or column_name not in df.columns or 'patientid' not in df.columns:
                continue

            # Group the source values by patient once, instead of filtering the
            # whole table again for every patient id.
            values_by_patient = {
                patient_id: group[column_name]
                for patient_id, group in df.groupby('patientid', sort=False)
            }

            for patient_id in all_patient_ids:
                patient_values = values_by_patient.get(patient_id)
                # A patient without rows in this table still gets one null result
                source_values = [None] if patient_values is None else patient_values

                for source_value in source_values:
                    match_percentage = (
                        calculate_match_percentage(inclusion_criteria, source_value)
                        if pd.notna(source_value) else 0
                    )
                    results.append({
                        'Patient_ID': patient_id,
                        'Trial_Name': trial_name,
                        'Trial_ID': trial_id,
                        'Inclusion_Criteria': inclusion_criteria,
                        'Category': category,
                        'Source_Column': source_column,
                        'Source_Value': source_value,
                        'Match_Percentage': match_percentage,
                    })

    # Fixing the columns keeps the schema stable when no results were produced
    return pd.DataFrame(results, columns=result_columns)

# Score patients against the DISEASE criteria only
is_disease = clinical_trials_incl_ner['Category'] == 'DISEASE'
clinical_trials_incl_ner_disease = clinical_trials_incl_ner[is_disease]
matched_disease = match_patients_to_criteria(clinical_trials_incl_ner_disease, table_dataframes)

# Expert labels: one row per distinct (patient, diagnosis group, decision)
expert_columns = ['patientid', 'targetdetaileddiagnosisgroup', 'Expert_Decision_Disease']
eligible_subset = eligible[expert_columns].drop_duplicates()

# Attach the expert decision to each scored row, pairing on patient and diagnosis value
matched_disease = pd.merge(
    matched_disease,
    eligible_subset,
    how='left',
    left_on=['Patient_ID', 'Source_Value'],
    right_on=['patientid', 'targetdetaileddiagnosisgroup'],
)

# Rows without an expert label count as "not eligible" (0)
expert_decision = matched_disease['Expert_Decision_Disease'].fillna(0)
matched_disease['Expert_Decision_Disease'] = expert_decision.astype(int)

# Inspect the labelled result
print(matched_disease)

                                Patient_ID  Trial_Name  Trial_ID  \
0     2D6D0A37-E044-4D5A-85A8-0331B753A3F2  Test_Trial       123   
1     2D6D0A37-E044-4D5A-85A8-0331B753A3F2  Test_Trial       123   
2     2D6D0A37-E044-4D5A-85A8-0331B753A3F2  Test_Trial       123   
3     86E11975-50A1-4B40-9424-040FDD087809  Test_Trial       123   
4     51D81E7C-D95C-4338-827D-E8DA084F77C7  Test_Trial       123   
...                                    ...         ...       ...   
4377  4AEAE5F0-9EC7-4B55-AC30-5D8C5A16250D  Test_Trial       123   
4378  381BC62B-F71E-435E-87F4-B5BB88ABC4C2  Test_Trial       123   
4379  0C92FCC0-2D67-4058-B943-644157E00191  Test_Trial       123   
4380  0C92FCC0-2D67-4058-B943-644157E00191  Test_Trial       123   
4381  0C92FCC0-2D67-4058-B943-644157E00191  Test_Trial       123   

              Inclusion_Criteria Category  \
0     Diagnosed with Bone Cancer  DISEASE   
1     Diagnosed with Bone Cancer  DISEASE   
2     Diagnosed with Bone Cancer  DISEASE   
3  

In [12]:
# Write the scored rows, including the expert decision column, to CSV without the index
matched_disease.to_csv('matched_disease_bone.csv', index=False)

In [13]:
######################################## Evaluate by Inclusion Criteria ########################################

# A row counts as a model "match" when Match_Percentage is strictly above this value
MATCH_THRESHOLD = 60

eval_result_df_disease = matched_disease.copy()
eval_result_df_disease['Model_Decision'] = (
    eval_result_df_disease['Match_Percentage'] > MATCH_THRESHOLD
).astype(int)

# Calculate overall confusion matrix and kappa score
y_true_disease = eval_result_df_disease['Expert_Decision_Disease']
y_pred_disease = eval_result_df_disease['Model_Decision']

# labels=[0, 1] keeps the matrix 2x2 (rows: expert 0/1, columns: model 0/1)
# even when one of the classes does not occur in the data.
conf_matrix_disease = confusion_matrix(y_true_disease, y_pred_disease, labels=[0, 1])
kappa_score_disease = cohen_kappa_score(y_true_disease, y_pred_disease)

# Print overall results.
# The eligible / ineligible figures count distinct patients, while the confusion
# matrix counts rows of matched_disease (one per patient diagnosis row), so the
# two sets of totals are not expected to agree.
print("Results for Category: DISEASE")
print(f"\nPatients Eligible: {len(eligible_patient_ids)}")
print(f"Patients Ineligible: {len(ineligible_patient_ids)}")
print("\nOverall Results:")
print("\nConfusion Matrix:")
print(conf_matrix_disease)
print("\nCohen's Kappa Score:", kappa_score_disease)
print("\n" + "="*50 + "\n")

Results for Category: DISEASE

Patients Eligible: 6
Patients Ineligible: 993

Overall Results:

Confusion Matrix:
[[4374    0]
 [   0    8]]

Cohen's Kappa Score: 1.0




In [20]:
history

######################################## Load Data ########################################

# Get sample patient ids
cursor = conn.cursor()
cursor.execute("""
    SELECT DISTINCT "patientid" FROM ONCOEMR_RAW_DEV.DBO.DEMOGRAHPICS
    ORDER BY RANDOM()
""")
sample_patient_ids = [row[0] for row in cursor.fetchall()]
sample_patient_ids = [f"'{id}'" if isinstance(id, str) else str(id) for id in sample_patient_ids]

cursor = conn.cursor()
cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'DBO'
    AND table_catalog = 'ONCOEMR_RAW_DEV'
    AND table_type = 'BASE TABLE';
""")

# Fetch all the table names
tables = [row[0] for row in cursor.fetchall()]
#print(tables)

# Create a dictionary to hold each table as a DataFrame
table_dataframes = {}
table_dataframes_spat = {}

for table in tables:
    # First, check if the table contains 'patientid' by querying the columns of the table
    cursor.execute(f"""
        SELECT column_name
        FROM 