In [5]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from ai_cdss.data_loader import DataLoader
from ai_cdss.data_processor import DataProcessor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Notebook-wide settings; they are consumed when the loader and processor are built.
rgs_mode = "app"  # forwarded as DataLoader(rgs_mode=...)
scoring_weights = [1,1,1]  # forwarded as DataProcessor(weights=...)
ewma_alpha = 0.5  # forwarded as DataProcessor(alpha=...)

In [None]:
# Patients to load; passed as `patient_list` to both loader calls.
patient_list = [775]

In [8]:
# Build the data loader and the processor from the notebook settings.
loader = DataLoader(rgs_mode=rgs_mode)
processor = DataProcessor(weights=scoring_weights, alpha=ewma_alpha)

In [9]:
# Load the session and timeseries data for the selected patients.
# Both frames are the inputs of every check that follows.
session = loader.load_session_data(patient_list=patient_list)
timeseries = loader.load_timeseries_data(patient_list=patient_list)

Database engine created successfully


INFO:ai_cdss.data_loader:Session data loaded successfully.


Database engine closed
Database engine created successfully
Database engine closed
Database engine created successfully
Database engine closed


#### Timeseries

`test_timeseries_dms_aggregation`

In [10]:
timeseries_dms_agg = processor.aggregate_dms_by_time(timeseries)

# One group per timepoint of a protocol within a patient session.
timepoint_keys = ["PATIENT_ID", "SESSION_ID", "PROTOCOL_ID", "SECONDS_FROM_START"]
dm_counts = timeseries_dms_agg.groupby(timepoint_keys)["DM_KEY"].count()
num_dms_timepoint = set(dm_counts.values)

# After aggregation every timepoint must hold exactly one DM, i.e. the only
# count observed across all groups is 1.
assert num_dms_timepoint == {1}

`test_ewma_computation`

In [11]:
def manual_ewma(values, alpha):
    """Reference EWMA (the ``adjust=True`` formulation), computed explicitly.

    For every position ``t`` the result is the weighted mean of
    ``values[0..t]`` where the most recent value has weight 1 and a value
    ``i`` steps in the past has weight ``(1 - alpha) ** i``.

    Args:
        values: Sliceable sequence of numbers, oldest first.
        alpha: Smoothing factor.

    Returns:
        A list with one EWMA value per input position (empty for empty input).
    """
    decay = 1 - alpha
    smoothed = []
    for end in range(1, len(values) + 1):
        # Weight for lag 0 (newest value) comes first, so pair the weights
        # with the observed window in reverse chronological order.
        coeffs = [decay ** lag for lag in range(end)]
        newest_first = reversed(values[:end])
        numerator = sum(coeff * value for coeff, value in zip(coeffs, newest_first))
        smoothed.append(numerator / sum(coeffs))
    return smoothed

# Test subset: the first patient and the first protocol present in the aggregated data.
test_patient = timeseries_dms_agg.PATIENT_ID.unique()[0]
test_protocol = timeseries_dms_agg.PROTOCOL_ID.unique()[0]

# Manual result.
# Both conditions are combined into a single mask on the same frame; chaining
# `df[mask_a][mask_b]` applies a full-length mask to an already filtered frame
# and makes pandas emit a reindexing warning.
manual_mask = (
    (timeseries_dms_agg["PATIENT_ID"] == test_patient)
    & (timeseries_dms_agg["PROTOCOL_ID"] == test_protocol)
)
values = timeseries_dms_agg.loc[manual_mask, "DM_VALUE"].tolist()
# An empty subset would make the comparison below pass without checking anything.
assert values, f"No DM_VALUE rows for patient={test_patient}, protocol={test_protocol}"
result_manual = manual_ewma(values, processor.alpha)

# Processor result (EWMA of DM_VALUE grouped by patient and protocol).
timeseries_dms_agg_ewma = processor.compute_ewma(timeseries_dms_agg, "DM_VALUE", ["PATIENT_ID", "PROTOCOL_ID"])
computed_mask = (
    (timeseries_dms_agg_ewma["PATIENT_ID"] == test_patient)
    & (timeseries_dms_agg_ewma["PROTOCOL_ID"] == test_protocol)
)
result_values = timeseries_dms_agg_ewma.loc[computed_mask, "DM_VALUE"].tolist()

# zip() stops at the shorter input, so a length mismatch must be caught explicitly.
assert len(result_manual) == len(result_values), (
    f"Length mismatch: Manual={len(result_manual)}, Computed={len(result_values)}"
)

# Compare manually computed and function-computed EWMA values using assert
for manual, computed in zip(result_manual, result_values):
    # Use assert with a small tolerance
    assert abs(manual - computed) < 1e-6, f"Mismatch: Manual={manual}, Computed={computed}"

#### Session

`test_single_session_data`

In [12]:
# Distinct row counts observed across all (patient, session) pairs.
num_data_per_session = session.groupby(["PATIENT_ID", "SESSION_ID"])["SESSION_ID"].count().unique()
# Check there is exactly one row per patient session
assert len(num_data_per_session) == 1, (
    f"Expected a single row count across sessions, found: {num_data_per_session.tolist()}"
)
assert num_data_per_session[0] == 1, (
    f"Expected exactly one row per patient session, found: {num_data_per_session[0]}"
)

`test_session_compute_adherence_ewma`

In [13]:
# Manual result.
# `test_patient` / `test_protocol` are the subset chosen in the timeseries EWMA test.
# Both conditions are combined into a single mask on the same frame; chaining
# `df[mask_a][mask_b]` applies a full-length mask to an already filtered frame
# and makes pandas emit a reindexing warning.
manual_mask = (
    (session["PATIENT_ID"] == test_patient)
    & (session["PROTOCOL_ID"] == test_protocol)
)
values = session.loc[manual_mask, "ADHERENCE"].tolist()
# An empty subset would make the comparison below pass without checking anything.
assert values, f"No ADHERENCE rows for patient={test_patient}, protocol={test_protocol}"
result_session_manual = manual_ewma(values, processor.alpha)

# Processor result (EWMA of ADHERENCE grouped by patient and protocol).
session_ewma_adherence = processor.compute_ewma(session, "ADHERENCE", ["PATIENT_ID", "PROTOCOL_ID"])
computed_mask = (
    (session_ewma_adherence["PATIENT_ID"] == test_patient)
    & (session_ewma_adherence["PROTOCOL_ID"] == test_protocol)
)
result_session_values = session_ewma_adherence.loc[computed_mask, "ADHERENCE"].tolist()

# zip() stops at the shorter input, so a length mismatch must be caught explicitly.
assert len(result_session_manual) == len(result_session_values), (
    f"Length mismatch: Manual={len(result_session_manual)}, Computed={len(result_session_values)}"
)

# Compare manually computed and function-computed EWMA values using assert
for manual, computed in zip(result_session_manual, result_session_values):
    # Use assert with a small tolerance
    assert abs(manual - computed) < 1e-6, f"Mismatch: Manual={manual}, Computed={computed}"

`test_session_timeseries_merge`

In [15]:
# Extract unique SESSION_ID values and sort them so both sides are comparable
# position by position.
session_unique = session["SESSION_ID"].drop_duplicates().sort_values().reset_index(drop=True)
timeseries_unique = timeseries["SESSION_ID"].drop_duplicates().sort_values().reset_index(drop=True)

try:
    # Compare the two Series. Note: assert_series_equal also checks dtype by
    # default, so it can fail even when no ID is listed as mismatched below.
    pd.testing.assert_series_equal(session_unique, timeseries_unique, check_names=False)
    print("All unique SESSION_ID values match!")

except AssertionError:
    print("Differences found in unique SESSION_ID values:")

    # IDs present on one side only
    mismatched_session = session_unique[~session_unique.isin(timeseries_unique)]
    mismatched_timeseries = timeseries_unique[~timeseries_unique.isin(session_unique)]

    # Report using the names of the frames that are actually compared here
    if not mismatched_session.empty:
        print("Values in `session` but not in `timeseries`:")
        print(mismatched_session.tolist())

    if not mismatched_timeseries.empty:
        print("Values in `timeseries` but not in `session`:")
        print(mismatched_timeseries.tolist())

    # Re-raise the active exception unchanged for further debugging
    raise

Differences found in unique SESSION_ID values:
Values in `session_ewma_adherence` but not in `timeseries_dms_agg_ewma`:
[16796, 16800, 16802, 17119, 17631, 17633, 17635, 17970, 17975, 19051, 19908, 19917, 21948, 21959, 21965, 22434, 22438, 22880, 22882, 23231, 24225, 24230, 24236, 24631]


AssertionError: Series are different

Series length are different
[left]:  62, RangeIndex(start=0, stop=62, step=1)
[right]: 38, RangeIndex(start=0, stop=38, step=1)