# Eligibility for mobilization: Cohort identification. 

This script identifies the cohort using CLIF 2.0 tables. 

Requirements:
* Required tables (file names must match exactly, with the extension given by `file_type` in the config): clif_patient, clif_hospitalization, clif_adt, clif_vitals, clif_labs, clif_medication_admin_continuous, clif_respiratory_support

## Load Libraries

In [1]:
import sys
import os
import time
import pandas as pd
import numpy as np
import duckdb
import pyCLIF

Loaded configuration from config.json
{'site_name': 'UCMC', 'tables_path': '/Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19', 'file_type': 'parquet'}


In [2]:
# Sanity check: the directory shown below should be the folder that contains
# this notebook, because later cells build paths relative to the working
# directory (e.g. the parent directory added to sys.path, "../output/final").
os.getcwd()

'/Users/kavenchhikara/Desktop/CLIF/CLIF-eligibility-for-mobilization/code'

## Load the config details

In [5]:
# Add the parent of the current working directory to the import path
# (presumably the project root that holds the `utils` package — confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import config
# Load the configuration.
# NOTE: this rebinds the name `config` from the imported module to the value
# returned by load_config(); later cells index it like a dict.
config = config.load_config()

Loaded configuration from config.json


In [6]:
# Access configuration parameters
site_name = config['site_name']
tables_path = config['tables_path']
file_type = config['file_type']
# Final outputs go to ../output/final, resolved relative to the working directory.
output_path = os.path.join("..", "output", "final")

# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Print the configuration parameters
print(f"Site Name: {site_name}")
print(f"Tables Path: {tables_path}")
print(f"Output path: {output_path}")
print(f"File Type: {file_type}")

Site Name: UCMC
Tables Path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19
Output path: ../output/final
File Type: parquet


In [None]:
## Confirm that these are the correct paths
# Every table is expected directly under `tables_path`, named
# clif_<table>.<file_type>. No extra sub-directory is inserted, so all seven
# paths follow the same layout.
patient_filepath                 = f"{tables_path}/clif_patient.{file_type}"
hospitalization_filepath         = f"{tables_path}/clif_hospitalization.{file_type}"
adt_filepath                     = f"{tables_path}/clif_adt.{file_type}"
vitals_filepath                  = f"{tables_path}/clif_vitals.{file_type}"
labs_filepath                    = f"{tables_path}/clif_labs.{file_type}"
meds_filepath                    = f"{tables_path}/clif_medication_admin_continuous.{file_type}"
resp_support_filepath            = f"{tables_path}/clif_respiratory_support.{file_type}"

## Common Functions

In [9]:
def read_data(filepath, file_type):
    """
    Read a table from disk into a pandas DataFrame and print load statistics.

    Parameters:
        filepath (str): Path to the file.
        file_type (str): Type of the file ('csv' or 'parquet').
    Returns:
        DataFrame: DataFrame containing the data.
    Raises:
        ValueError: If file_type is neither 'csv' nor 'parquet'.
    """
    file_name = os.path.basename(filepath)
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()
    if file_type == 'csv':
        df = pd.read_csv(filepath)
    elif file_type == 'parquet':
        # Use pandas' own parquet reader: it needs no separate `pq` alias to
        # be imported and returns the DataFrame directly.
        df = pd.read_parquet(filepath)
    else:
        raise ValueError("Unsupported file type. Please provide either 'csv' or 'parquet'.")
    load_time = time.perf_counter() - start_time

    # In-memory size of the loaded dataset in MB; deep=True also counts the
    # contents of object (e.g. string) columns.
    dataset_size_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
    print(f"File name: {file_name}")
    print(f"Time taken to load the dataset: {load_time:.2f} seconds")
    print(f"Size of the loaded dataset: {dataset_size_mb:.2f} MB\n")

    return df
    
def count_unique_encounters(df, encounter_column='encounter_id'):
    """
    Return how many distinct encounter IDs a DataFrame contains.

    Parameters:
    df (DataFrame): The DataFrame to inspect.
    encounter_column (str): Column holding the encounter IDs (default is 'encounter_id').

    Returns:
    int: Number of distinct values in that column, as reported by
         pandas' Series.nunique() with its default settings.
    """
    encounter_ids = df[encounter_column]
    distinct_count = encounter_ids.nunique()
    return distinct_count


def generate_facetgrid_histograms(data, category_column, value_column):
    """
    Draw one histogram of `value_column` per category in a seaborn FacetGrid.

    Parameters:
        data (DataFrame): DataFrame containing the data.
        category_column (str): Column whose values define the facets.
        value_column (str): Column whose values are binned in each facet.

    Returns:
        FacetGrid: The seaborn FacetGrid holding the histograms.
    """
    # NOTE(review): relies on `sns` (seaborn) and `plt` (matplotlib.pyplot)
    # being bound at module level — confirm they are imported before use.

    # One facet per category, wrapped six per row, each with independent axes.
    grid = sns.FacetGrid(
        data,
        col=category_column,
        col_wrap=6,
        sharex=False,
        sharey=False,
    )
    grid.map(sns.histplot, value_column, bins=30, color='blue', edgecolor='black')

    # Facet titles show the category value; axes get common labels.
    grid.set_titles('{col_name}')
    grid.set_axis_labels(value_column, 'Frequency')

    # Leave head-room above the facets for the overall figure title.
    plt.subplots_adjust(top=0.9)
    overall_title = f'Histograms of {value_column} by {category_column}'
    grid.fig.suptitle(overall_title, fontsize=16)

    return grid

def standardize_datetime(df):
    """
    Strip timezone information from every datetime column of a DataFrame.

    Only columns that already have a datetime dtype are touched. Timezone-aware
    columns are passed through `tz_convert(None)`, which yields naive
    timestamps expressed in UTC; naive datetime columns are left as they are.
    The DataFrame is modified in place and also returned.

    Parameters:
        df (DataFrame): DataFrame containing the data.
    Returns:
        DataFrame: The same DataFrame object, with timezone-naive datetime columns.
    """
    for column_name in df.columns:
        column = df[column_name]
        if not pd.api.types.is_datetime64_any_dtype(column):
            continue
        if column.dt.tz is not None:
            # To keep the values timezone-aware in UTC instead, localize naive
            # columns to 'UTC' and tz_convert aware ones to 'UTC' here.
            column = column.dt.tz_convert(None)
        df[column_name] = column
    return df

def get_sql_import(file_type):
    """
    Return the name of the SQL reader function for the given file type.

    The returned names ('read_parquet', 'read_csv_auto') are presumably meant
    to be interpolated into DuckDB queries — confirm against the callers.

    Parameters:
        file_type (str): Type of the file ('csv' or 'parquet').
    Returns:
        str: 'read_parquet' for parquet files, 'read_csv_auto' for csv files.
    Raises:
        ValueError: If file_type is neither 'csv' nor 'parquet'.
    """
    if file_type == 'parquet':
        return 'read_parquet'
    if file_type == 'csv':
        return 'read_csv_auto'
    # Fail loudly instead of returning None, which would otherwise surface
    # later as a confusing error when the value is used to build a query.
    raise ValueError("Unsupported file type. Please provide either 'csv' or 'parquet'.")

# Resolve the SQL reader function name once, for the file type set in the config.
sql_import = get_sql_import(file_type=file_type)

## Load data

In [6]:
# Load each CLIF table used for cohort identification through pyCLIF.
# NOTE(review): load_data presumably resolves the file location and format
# from the loaded configuration — confirm in pyCLIF.
patient = pyCLIF.load_data('clif_patient')
hospitalization = pyCLIF.load_data('clif_hospitalization')
adt = pyCLIF.load_data('clif_adt')
vitals = pyCLIF.load_data('clif_vitals')
labs = pyCLIF.load_data('clif_labs')
meds = pyCLIF.load_data('clif_medication_admin_continuous')
resp_support = pyCLIF.load_data('clif_respiratory_support')

Table final path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_patient.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_patient.parquet
Table final path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_hospitalization.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_hospitalization.parquet
Table final path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_adt.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_adt.parquet
Table final path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_vitals.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_vitals.parquet
Table final path: /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_labs.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_labs.parquet
Table fina

## Project-specific QC

In [None]:
# check that all these tables have the required variables
# check variable data types are correct; if not correct them
# check the particular categories we want 
# check that the categories value range is appropriate
# check that the tables do not have duplicates

In [None]:
# Expected schemas: column name -> expected data type label, one dict per table.
expected_columns_clif_patient = dict(
    patient_id='varchar',
    race_category='varchar',
    ethnicity_category='varchar',
    sex_category='varchar',
)

expected_columns_clif_hospitalization = dict(
    patient_id='varchar',
    hospitalization_id='varchar',
    admission_dttm='datetime',
    discharge_dttm='datetime',
    age_at_admission='int',
)

expected_columns_clif_vitals = dict(
    hospitalization_id='varchar',
    recorded_dttm='datetime',
    vital_category='varchar',
    vital_value='float',
)

# Category values that the vitals table is checked against.
expected_categories_clif_vitals = dict(
    vital_category=['heart_rate', 'resp_rate', 'sbp', 'dbp', 'map', 'spo2'],
)

# Load the tables to be checked.
df_patient = pyCLIF.load_data('clif_patient')
df_hospitalization = pyCLIF.load_data('clif_hospitalization')
df_vitals = pyCLIF.load_data('clif_vitals')

# Run the QC checks table by table; only vitals also has a category check.
qc_report_patient = pyCLIF.qc_check(
    df_patient, 'clif_patient', expected_columns_clif_patient
)
qc_report_hospitalization = pyCLIF.qc_check(
    df_hospitalization, 'clif_hospitalization', expected_columns_clif_hospitalization
)
qc_report_vitals = pyCLIF.qc_check(
    df_vitals,
    'clif_vitals',
    expected_columns=expected_columns_clif_vitals,
    expected_categories=expected_categories_clif_vitals,
)

# Show the reports in the same order as the checks were run.
for _qc_report in (qc_report_patient, qc_report_hospitalization, qc_report_vitals):
    print(_qc_report)