In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn statsmodels interpret


Collecting interpret
  Downloading interpret-0.7.4-py3-none-any.whl.metadata (1.2 kB)
Collecting interpret-core==0.7.4 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.7.4->interpret)
  Downloading interpret_core-0.7.4-py3-none-any.whl.metadata (3.0 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.7.4->interpret)
  Downloading salib-1.5.2-py3-none-any.whl.metadata (11 kB)
Collecting aplr>=10.6.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.7.4->interpret)
  Downloading aplr-10.20.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting dash<3.0.0,>=2.0.0 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.7.4->interpret)
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting dash-cytoscape>=0.1.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.7.4->i

This notebook implements a complete data pipeline, forecasting models, and explainability analysis for patient mobility data.

In [None]:
import json
import pandas as pd

# Read both inputs explicitly as UTF-8: without an `encoding` argument open()
# falls back to the platform's locale encoding, which can mis-decode non-ASCII
# characters in the JSON payloads.
with open("timeseries-data.json", encoding="utf-8") as f:
    ts_data = json.load(f)

with open("categorical-data.json", encoding="utf-8") as f:
    clinical_data = json.load(f)

# Sanity check: number of raw time-series records and the clinical fields present.
print(len(ts_data))
print(clinical_data.keys())


80919
dict_keys(['gender', 'isSmoker', 'birthYear', 'birthMonth', 'disease', 'diagnoses', 'events', 'sideEffects', 'therapies', 'molecularAnalysisSelection'])


In [None]:
# Load the raw records into a frame and parse both interval bounds as datetimes.
ts_df = pd.DataFrame(ts_data)
for bound in ("start", "end"):
    ts_df[bound] = pd.to_datetime(ts_df[bound])


In [None]:
# Calendar day on which each record starts.
ts_df["date"] = ts_df["start"].dt.date

# Total the per-record counts for every day; the result has one row per day
# with the columns `date` and `Daily_Step_Count`.
daily_steps = (
    ts_df.groupby("date")
    .agg(Daily_Step_Count=("count", "sum"))
    .reset_index()
)

# Grouping was done on plain date objects; convert the key back to datetime64.
daily_steps = daily_steps.assign(date=pd.to_datetime(daily_steps["date"]))
daily_steps.head()


Unnamed: 0,date,Daily_Step_Count
0,2021-10-10,19980
1,2021-10-11,11229
2,2021-10-12,15117
3,2021-10-13,13527
4,2021-10-14,17781


In [None]:
# Continuous daily calendar spanning the first to the last observed day.
first_day = daily_steps["date"].min()
last_day = daily_steps["date"].max()
full_dates = pd.date_range(start=first_day, end=last_day, freq="D")

# Align the daily totals to that calendar. Days without any record appear as
# NaN after the reindex and are then filled with 0.
daily_by_date = daily_steps.set_index("date").reindex(full_dates)
daily_steps = daily_by_date.fillna(0).rename_axis("date").reset_index()


In [None]:
# Static patient attributes taken from the clinical record.
gender = clinical_data["gender"]
birth_year = clinical_data["birthYear"]
disease = clinical_data["disease"]

# Broadcast the attributes onto every daily row: age relative to the fixed
# reference year 2025, plus one 0/1 indicator per gender code.
daily_steps = daily_steps.assign(
    age=2025 - birth_year,
    gender_M=int(gender == "M"),
    gender_F=int(gender == "F"),
)


In [None]:
# Treat a missing key and an explicit null the same way: no therapies.
therapies = clinical_data.get("therapies") or []

# All comparisons below are done on timezone-naive timestamps.
daily_steps["date"] = pd.to_datetime(daily_steps["date"]).dt.tz_localize(None)
last_observed_day = daily_steps["date"].max()

# One 0/1 indicator column per therapy: is_on_therapy_1, is_on_therapy_2, ...
for i, therapy in enumerate(therapies):
    col = f"is_on_therapy_{i+1}"

    # Snap the start to midnight. The daily rows are stamped at 00:00, so a
    # therapy that begins later in the day would otherwise not be flagged on
    # its first day.
    start = pd.to_datetime(therapy["startDate"]).tz_localize(None).normalize()

    # An open-ended therapy (endDate missing or null) is treated as running
    # through the last observed day.
    end_raw = therapy.get("endDate")
    if end_raw is None:
        end = last_observed_day
    else:
        end = pd.to_datetime(end_raw).tz_localize(None)

    # Inclusive on both ends, matching the original `>= start` / `<= end` test.
    on_therapy = daily_steps["date"].between(start, end)
    daily_steps[col] = on_therapy.astype("int64")




In [None]:
# Treat a missing key and an explicit null the same way: no side effects.
side_effects = clinical_data.get("sideEffects") or []

# Re-initialise both features so that re-running this cell does not double count.
daily_steps["active_side_effect_count"] = 0
daily_steps["max_side_effect_intensity"] = 0
last_observed_day = daily_steps["date"].max()

for se in side_effects:
    # Parse as UTC, drop the timezone, and snap the start to midnight. The daily
    # rows are stamped at 00:00, so a side effect that begins later in the day
    # would otherwise not be counted on its first day.
    start = pd.to_datetime(se["startDate"], utc=True).tz_localize(None).normalize()

    # An unresolved side effect (endDate missing or null) is treated as active
    # through the last observed day.
    end_raw = se.get("endDate")
    if end_raw is None:
        end = last_observed_day
    else:
        end = pd.to_datetime(end_raw, utc=True).tz_localize(None)

    # A missing or null intensity counts as 0 instead of breaking the comparison.
    intensity = se.get("intensity") or 0

    # Inclusive on both ends.
    mask = daily_steps["date"].between(start, end)

    daily_steps.loc[mask, "active_side_effect_count"] += 1
    # Running maximum: raise the stored value to `intensity` wherever it is lower.
    daily_steps.loc[mask, "max_side_effect_intensity"] = (
        daily_steps.loc[mask, "max_side_effect_intensity"].clip(lower=intensity)
    )


In [None]:
# Calendar features derived from the daily timestamp.
daily_steps["day_of_week"] = daily_steps["date"].dt.dayofweek
daily_steps["week_of_year"] = daily_steps["date"].dt.isocalendar().week.astype(int)

# Lagged step counts: previous day, 7 days back and 30 days back.
lag_columns = []
for lag in (1, 7, 30):
    lag_col = f"steps_t-{lag}"
    daily_steps[lag_col] = daily_steps["Daily_Step_Count"].shift(lag)
    lag_columns.append(lag_col)

# Drop only the leading rows whose lags are undefined. Restricting the check to
# the lag columns keeps a NaN in any other feature from silently removing days
# from the middle of the series.
# NOTE(review): this cell rebinds `daily_steps`, so running it twice trims the
# series a second time; re-run the notebook from the top instead.
daily_steps = daily_steps.dropna(subset=lag_columns)


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Hold out the final 365 rows (one per day) for evaluation; fit on the rest.
train = daily_steps.iloc[:-365]
test = daily_steps.iloc[-365:]

# Index the training series by calendar day with an explicit daily frequency.
# The frame's own index is a leftover integer index that no longer starts at 0,
# which statsmodels cannot use for forecasting; a dated index avoids the index
# warnings and yields a forecast labelled with the dates it refers to.
train_steps = train.set_index("date")["Daily_Step_Count"].asfreq("D")

# ARIMA(1,1,1) with a weekly (period 7) seasonal (1,1,1) component.
model = SARIMAX(
    train_steps,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 7),
)

result = model.fit(disp=False)

# Forecast exactly as many days as were held out so the two align one-to-one.
forecast = result.forecast(steps=len(test))


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Error of the SARIMAX forecast against the held-out step counts.
sarimax_mse = mean_squared_error(test["Daily_Step_Count"], forecast)
rmse = np.sqrt(sarimax_mse)
mae = mean_absolute_error(test["Daily_Step_Count"], forecast)

# Displayed as (RMSE, MAE).
(rmse, mae)


(np.float64(11011.935702481822), 7575.646458316153)

In [None]:
from interpret.glassbox import ExplainableBoostingRegressor

# Target is the daily step total; every other column except the date is a feature.
target = daily_steps["Daily_Step_Count"]
features = daily_steps.drop(columns=["date", "Daily_Step_Count"])

# Chronological split: the final 365 rows are held out, the rest is training data.
X_train, X_test = features.iloc[:-365], features.iloc[-365:]
y_train, y_test = target.iloc[:-365], target.iloc[-365:]

# Fit the explainable boosting model with a fixed seed, then predict the held-out rows.
ebm = ExplainableBoostingRegressor(random_state=42)
ebm.fit(X_train, y_train)
preds = ebm.predict(X_test)


In [None]:
# Error of the EBM predictions against the held-out step counts.
ebm_mse = mean_squared_error(y_test, preds)
rmse_ebm = np.sqrt(ebm_mse)
mae_ebm = mean_absolute_error(y_test, preds)

# Displayed as (RMSE, MAE).
(rmse_ebm, mae_ebm)


(np.float64(7828.485636844886), 5600.727763052091)

In [None]:
from interpret import show

# Build the model-wide explanation of the fitted EBM and render it.
global_explanation = ebm.explain_global()
show(global_explanation)


In [None]:
# 365 consecutive days beginning the day after the last observed date.
horizon_start = daily_steps["date"].max() + pd.Timedelta(days=1)
future_dates = pd.date_range(horizon_start, periods=365)

# NOTE(review): `preds` and `forecast` were both produced for the held-out
# last 365 observed days, yet the rows below are labelled with the 365 days
# *after* the data ends. Confirm whether this table is meant to be a backtest
# (then it should carry the held-out dates) or a true future forecast (then
# the models need to be refit on all data and projected forward).
ebm_part = preds[:365]
sarimax_part = forecast.values[:365]

forecast_df = pd.DataFrame(
    {
        "Date": future_dates,
        "Predicted_Steps": ebm_part,           # EBM prediction
        "Trend_Component": sarimax_part,       # SARIMAX forecast
        "Exogenous_Impact": ebm_part - sarimax_part,  # EBM minus SARIMAX
    }
)

forecast_df.head()


Unnamed: 0,Date,Predicted_Steps,Trend_Component,Exogenous_Impact
0,2025-10-22,13305.365502,11119.428283,2185.937219
1,2025-10-23,24171.488832,8701.861315,15469.627516
2,2025-10-24,11346.239378,8464.099222,2882.140155
3,2025-10-25,8884.053209,7283.220542,1600.832667
4,2025-10-26,10016.092976,6265.118822,3750.974154


In [None]:
def upload_forecast_to_s3(df, bucket_name, file_name):
    """Upload a DataFrame to S3 as a CSV object.

    Previously this function was a silent no-op (its body was commented out),
    so callers believed an upload had happened when nothing was written.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to upload; written without the index column.
    bucket_name : str
        Name of the destination S3 bucket.
    file_name : str
        Object key under which the CSV is stored.

    Raises
    ------
    ImportError
        If boto3 is not installed.
    Exception
        Errors raised by boto3/botocore (for example missing credentials or
        an inaccessible bucket) are propagated to the caller.
    """
    # Imported lazily so the rest of the notebook runs without boto3 installed.
    import boto3

    # With no target path, to_csv returns the CSV text directly.
    csv_text = df.to_csv(index=False)

    s3 = boto3.client("s3")
    s3.put_object(
        Bucket=bucket_name,
        Key=file_name,
        Body=csv_text.encode("utf-8"),
        ContentType="text/csv",
    )
