In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from ai_cdss.data_loader import DataLoader
from ai_cdss.data_processor import DataProcessor

In [2]:
# Run configuration consumed by the loader/processor construction cell.
rgs_mode = "app"             # forwarded to DataLoader(rgs_mode=...)
scoring_weights = [1, 1, 1]  # forwarded to DataProcessor(weights=...)
ewma_alpha = 0.5             # forwarded to DataProcessor(alpha=...)

In [3]:
# Patient IDs handed to every loader call in this notebook.
patient_list = [775]

In [4]:
# Build the objects under test from the configuration cell:
# the loader receives the RGS mode, the processor the scoring weights and EWMA alpha.
loader = DataLoader(rgs_mode=rgs_mode)
processor = DataProcessor(weights=scoring_weights, alpha=ewma_alpha)

In [5]:
# All three loads are restricted to the same patients.
load_kwargs = {"patient_list": patient_list}

session = loader.load_session_data(**load_kwargs)
timeseries = loader.load_timeseries_data(**load_kwargs)
ppf = loader.load_ppf_data(**load_kwargs)

Database engine created successfully


INFO:ai_cdss.data_loader:Session data loaded successfully.


Database engine closed
Database engine created successfully
Database engine closed
Database engine created successfully


INFO:ai_cdss.data_loader:PPF data loaded successfully.


Database engine closed


#### Timeseries

`test_timeseries_dms_aggregation`

In [None]:
# Run the processor's DM-by-time aggregation on the raw timeseries.
timeseries_dms_agg = processor.aggregate_dms_by_time(timeseries)

# Count the non-null DM_KEY entries found at each timepoint of each patient/session/protocol.
timepoint_keys = ["PATIENT_ID", "SESSION_ID", "PROTOCOL_ID", "SECONDS_FROM_START"]
dm_keys_per_timepoint = timeseries_dms_agg.groupby(timepoint_keys)["DM_KEY"].count()
num_dms_timepoint = set(dm_keys_per_timepoint.values)

# Every timepoint must hold exactly one DM, i.e. the distinct counts reduce to {1}.
assert len(num_dms_timepoint) == 1
assert not (num_dms_timepoint - {1})

`test_ewma_computation`

In [None]:
def manual_ewma(values, alpha):
    """Compute the adjusted (``adjust=True``) exponentially weighted moving average.

    Reference implementation of

        y_t = sum_{i=0..t} (1 - alpha)**i * x_{t-i} / sum_{i=0..t} (1 - alpha)**i

    The numerator and denominator are carried forward with a recurrence, so the
    whole series costs O(n) instead of rebuilding every weight at each step.

    Args:
        values: Iterable of numbers ordered from oldest to newest.
        alpha: Smoothing factor; the observation ``i`` steps back gets weight
            ``(1 - alpha) ** i``.

    Returns:
        list: One EWMA value per input observation (``[]`` for empty input).
    """
    decay = 1 - alpha
    weighted_sum = 0.0  # running sum of decay**i * x_{t-i}
    weight_total = 0.0  # running sum of decay**i
    ewma_values = []
    for x in values:
        # Ageing all previous terms by one step multiplies both sums by `decay`;
        # the newest observation then enters with weight 1.
        weighted_sum = x + decay * weighted_sum
        weight_total = 1 + decay * weight_total
        ewma_values.append(weighted_sum / weight_total)
    return ewma_values

# Test subset: first patient and first protocol in order of appearance
test_patient = timeseries_dms_agg.PATIENT_ID.unique()[0]
test_protocol = timeseries_dms_agg.PROTOCOL_ID.unique()[0]

# Manual result.
# One combined mask with .loc replaces the chained df[m1][m2] lookup: the second mask
# was built on the unfiltered frame, so pandas had to reindex it against the filtered
# frame (and may warn "Boolean Series key will be reindexed").
is_test_subset = (
    (timeseries_dms_agg["PATIENT_ID"] == test_patient)
    & (timeseries_dms_agg["PROTOCOL_ID"] == test_protocol)
)
values = timeseries_dms_agg.loc[is_test_subset, "DM_VALUE"].tolist()
result_manual = manual_ewma(values, processor.alpha)

# Processor result
timeseries_dms_agg_ewma = processor.compute_ewma(timeseries_dms_agg, "DM_VALUE", ["PATIENT_ID", "PROTOCOL_ID"])
is_test_subset_ewma = (
    (timeseries_dms_agg_ewma["PATIENT_ID"] == test_patient)
    & (timeseries_dms_agg_ewma["PROTOCOL_ID"] == test_protocol)
)
result_values = timeseries_dms_agg_ewma.loc[is_test_subset_ewma, "DM_VALUE"].tolist()

# zip() stops at the shorter input, so a length mismatch would otherwise pass unnoticed.
assert len(result_manual) == len(result_values), (
    f"Length mismatch: Manual={len(result_manual)}, Computed={len(result_values)}"
)

# Compare manually computed and function-computed EWMA values using assert
for manual, computed in zip(result_manual, result_values):
    # Use assert with a small tolerance
    assert abs(manual - computed) < 1e-6, f"Mismatch: Manual={manual}, Computed={computed}"

#### Session

`test_single_session_data`

In [None]:
# Each (patient, session) pair must appear exactly once in the session table,
# so the distinct per-session row counts must reduce to the single value 1.
rows_per_session = session.groupby(["PATIENT_ID", "SESSION_ID"])["SESSION_ID"].count()
num_data_per_session = rows_per_session.unique()
assert len(num_data_per_session) == 1
assert num_data_per_session[0] == 1

`test_session_compute_adherence_ewma`

In [None]:
# Manual result.
# One combined mask with .loc replaces the chained df[m1][m2] lookup: the second mask
# was built on the unfiltered frame, so pandas had to reindex it against the filtered
# frame (and may warn "Boolean Series key will be reindexed").
is_test_session = (
    (session["PATIENT_ID"] == test_patient)
    & (session["PROTOCOL_ID"] == test_protocol)
)
values = session.loc[is_test_session, "ADHERENCE"].tolist()
# test_patient / test_protocol were picked from the timeseries data; an empty selection
# here would make the comparison loop below pass without checking anything.
assert values, f"No session rows for patient {test_patient}, protocol {test_protocol}"
result_session_manual = manual_ewma(values, processor.alpha)

# Processor result
session_ewma_adherence = processor.compute_ewma(session, "ADHERENCE", ["PATIENT_ID", "PROTOCOL_ID"])
is_test_session_ewma = (
    (session_ewma_adherence["PATIENT_ID"] == test_patient)
    & (session_ewma_adherence["PROTOCOL_ID"] == test_protocol)
)
result_session_values = session_ewma_adherence.loc[is_test_session_ewma, "ADHERENCE"].tolist()

# zip() stops at the shorter input, so a length mismatch would otherwise pass unnoticed.
assert len(result_session_manual) == len(result_session_values), (
    f"Length mismatch: Manual={len(result_session_manual)}, Computed={len(result_session_values)}"
)

# Compare manually computed and function-computed EWMA values using assert
for manual, computed in zip(result_session_manual, result_session_values):
    # Use assert with a small tolerance
    assert abs(manual - computed) < 1e-6, f"Mismatch: Manual={manual}, Computed={computed}"

`test_session_usage`

In [6]:
# Non-null SESSION_ID count per patient/protocol, taken straight from the session table.
usage = session.groupby(["PATIENT_ID", "PROTOCOL_ID"])["SESSION_ID"].count()

# Output of the full processing pipeline on the same inputs.
# NOTE(review): `usage` and `processor_usage` are never compared in this cell,
# so this test currently asserts nothing.
processor_usage = processor.process_data(
    session_data=session,
    timeseries_data=timeseries,
    ppf_data=ppf,
)

In [7]:
# Combine the session and timeseries frames through the processor's merge step.
merge_test = processor.merge_session_and_timeseries(
    session_data=session,
    timeseries_data=timeseries,
)

In [8]:
# Feed the merged session/timeseries result into the processor's per-protocol aggregation.
agg_test = processor.aggregate_metrics_per_protocol(merge_test)

In [None]:
# Display the aggregation result for manual inspection.
agg_test

In [9]:
# Left join: keep every PPF row and attach the aggregated metrics where the
# patient/protocol pair has a match.
data_test = pd.merge(
    ppf,
    agg_test,
    how="left",
    on=["PATIENT_ID", "PROTOCOL_ID"],
)


In [10]:
# Inspect the column dtypes of the merged frame.
data_test.dtypes

PATIENT_ID       int64
PROTOCOL_ID      int64
PPF            float64
CONTRIB         object
ADHERENCE      float64
DM_VALUE       float64
PE_VALUE       float64
USAGE            Int64
DAYS            object
dtype: object

In [11]:
# Display the left-joined frame; `ppf` rows without a match in `agg_test` keep NaN metrics.
data_test

Unnamed: 0,PATIENT_ID,PROTOCOL_ID,PPF,CONTRIB,ADHERENCE,DM_VALUE,PE_VALUE,USAGE,DAYS
0,775,214,0.617411,"[0.014909525935815035, 0.04969841978605012, 0....",0.609462,0.949463,0.934906,5.0,[2]
1,775,223,0.626448,"[0.01426645761043579, 0.04755485870145264, 0.1...",,,,,
2,775,208,0.484665,"[0.02088597029033065, 0.13923980193553767, 0.0...",0.674691,0.99497,0.974205,13.0,"[0, 2, 4]"
3,775,204,0.659929,"[0.013422880045535941, 0.13422880045535943, 0....",,,,,
4,775,205,0.596419,"[0.01698382609953423, 0.1698382609953423, 0.0,...",,,,,
5,775,219,0.631371,"[0.016784244244057927, 0.16784244244057928, 0....",,,,,
6,775,209,0.667707,"[0.01326523652661333, 0.13265236526613333, 0.0...",0.384762,,,3.0,[]
7,775,206,0.528646,"[0.013972368519644753, 0.13972368519644754, 0....",0.770777,1.0,0.832768,12.0,"[0, 4]"
8,775,226,0.655305,"[0.012880010466263369, 0.12880010466263372, 0....",,,,,
9,775,221,0.648561,"[0.020376302023104097, 0.20376302023104098, 0....",,,,,


In [None]:
# Step 1: order the sessions by patient, protocol and session id.
session_sorted = session.sort_values(by=["PATIENT_ID", "PROTOCOL_ID", "SESSION_ID"])

# Step 2: compute the adherence EWMA per patient/protocol group.
session_sorted_ewma = processor.compute_ewma(
    session_sorted,
    value_col="ADHERENCE",
    group_cols=["PATIENT_ID", "PROTOCOL_ID"],
)

# Step 3: USAGE is the number of distinct sessions in each patient/protocol group,
# broadcast back onto every row of that group.
sessions_per_group = (
    session_sorted_ewma
    .groupby(["PATIENT_ID", "PROTOCOL_ID"])["SESSION_ID"]
    .transform("nunique")
)
session_test = session_sorted_ewma.assign(USAGE=sessions_per_group)

`test_session_timeseries_merge`

In [None]:
# Extract unique SESSION_ID values and sort them so both sides are directly comparable
session_unique = session["SESSION_ID"].drop_duplicates().sort_values().reset_index(drop=True)
timeseries_unique = timeseries["SESSION_ID"].drop_duplicates().sort_values().reset_index(drop=True)

try:
    # Compare the two Series (values, length and dtype; names are ignored)
    pd.testing.assert_series_equal(session_unique, timeseries_unique, check_names=False)
    print("All unique SESSION_ID values match!")

except AssertionError:
    print("Differences found in unique SESSION_ID values:")

    # Find the ids present on one side only
    mismatched_session = session_unique[~session_unique.isin(timeseries_unique)]
    mismatched_timeseries = timeseries_unique[~timeseries_unique.isin(session_unique)]

    # Print mismatched values; the messages name the frames that are actually compared
    if not mismatched_session.empty:
        print("Values in `session` but not in `timeseries`:")
        print(mismatched_session.tolist())

    if not mismatched_timeseries.empty:
        print("Values in `timeseries` but not in `session`:")
        print(mismatched_timeseries.tolist())

    # Bare raise re-raises the active exception with its original traceback
    raise