# Evaluation

In [50]:
import pandas as pd
from datetime import timedelta 
import numpy as np
import config
import warnings
warnings.filterwarnings('ignore')

## Preprocessing

In [30]:
df = pd.read_csv("~/Documents/github/paper/input/raw_test_data_cleaned.csv")
df.rename(columns={'id': 'eye_id', 'Drug': 'actual_drug_today'}, inplace=True)
df.head()

Unnamed: 0,eye_id,admission_date,visual_acuity,actual_drug_today,InjNext,next_interval_in_weeks,laterality,Interval
0,1,2019-08-10,89.0,,,,Right,0.0
1,1,2019-08-10,89.0,Lucentis,Lucentis,4.0,Right,0.0
2,1,2019-10-29,85.0,Lucentis,Lucentis,4.0,Right,11.0
3,1,2019-11-26,85.0,Lucentis,Lucentis,6.0,Right,4.0
4,1,2020-01-14,85.0,Lucentis,Lucentis,7.0,Right,7.0


In [34]:
def id_cleaner(df):
    id_list = df['eye_id'].unique()
    frames, i = [], 0
    for eye in id_list:
        pdf = df[df.eye_id == eye]
        pdf_left = pdf[pdf.laterality == 'Left']
        pdf_right = pdf[pdf.laterality == 'Right']
        pdf_left.eye_id = i
        pdf_right.eye_id = i + 1
        if len(pdf_left) != 0:
            frames.append(pdf_left)
            i += 1
        if len(pdf_right) != 0:
            frames.append(pdf_right)
            i += 1
    return pd.concat(frames)

def time_sort(df):
    id_list = df.eye_id.unique()
    frames = []
    for eye in id_list:
        pdf = df[df.eye_id == eye]
        pdf.admission_date = pd.to_datetime(pdf.admission_date, dayfirst=True)
        pdf.sort_values(by='admission_date', inplace=True)
        frames.append(pdf)
    return pd.concat(frames)

def raw_data(df):
    cleaned_df = time_sort(id_cleaner(df))
    cleaned_df.reset_index(inplace=True, drop=True)
    #cleaned_df.drop(columns=['next_interval_in_weeks', 'ur', 'laterality', 'id'], inplace=True)
    #cleaned_df.rename(columns={"eye_id": "id"}, inplace=True)
    cleaned_df.replace(['nil', np.nan], inplace=True)
    cleaned_df["actual_drug_today"].replace({"nil": np.nan}, inplace=True)
    cleaned_df.head()
    return cleaned_df

In [35]:
cleaned_df = raw_data(df)

In [36]:
cleaned_df

Unnamed: 0,eye_id,admission_date,visual_acuity,actual_drug_today,InjNext,next_interval_in_weeks,laterality,Interval
0,1,2019-08-10,89.0,,,,Right,0.0
1,1,2019-08-10,89.0,Lucentis,Lucentis,4.0,Right,0.0
2,1,2019-10-29,85.0,Lucentis,Lucentis,4.0,Right,11.0
3,1,2019-11-26,85.0,Lucentis,Lucentis,6.0,Right,4.0
4,1,2020-01-14,85.0,Lucentis,Lucentis,7.0,Right,7.0
...,...,...,...,...,...,...,...,...
5944,248,2020-10-27,76.0,Lucentis,Lucentis,2.0,Right,2.0
5945,248,2020-12-08,76.0,Lucentis,Lucentis,10.0,Right,6.0
5946,248,2020-12-15,80.0,Lucentis,Lucentis,12.0,Right,1.0
5947,248,2021-07-20,80.0,Lucentis,Lucentis,12.0,Right,31.0


In [64]:
def patient_cutoff(df, cutoff_year, cutoff_visits):
    # patients must have this many years of data to be included.
    frames = []
    id_list = df.eye_id.unique()
    for eye in id_list:
        pdf = df[df.eye_id == eye]
        dates = (pd.to_datetime(pdf.admission_date)).to_list()
        if ((dates[-1] - dates[0]).days)/365 >= cutoff_year and len(pdf)>=cutoff_visits: 
            frames.append(pdf)
    return pd.concat(frames)

def cut_time(df, cutoff_time):
        # shortens a patient's dataframe to x years after initiation.
        frames = []
        id_list = df.eye_id.unique()
        for eye in id_list:
            pdf = df[df.eye_id == eye]
            pdf.admission_date = pd.to_datetime(pdf.admission_date)
            dates = pdf['admission_date'].to_list()
            first = pd.to_datetime(dates[0])
            cutoff = first + timedelta(days=cutoff_time*365)
            pdf = pdf[pdf['admission_date'] <= cutoff]
            if len(pdf) > 0: frames.append(pdf)
        return pd.concat(frames)
    
def impute_pdf(df):
    fill_NaN = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputed_df = pd.DataFrame(fill_NaN.fit_transform(df))
    imputed_df.columns = df.columns
    imputed_df.index = df.index
    imputed_df.fillna(0, inplace=True)
    return imputed_df

def column_names(i):
    return [f'va_{i}', f'int_{i}']

def column_builder(i):
    lst = []
    for visits in range(1, i+1):
        lst.extend(column_names(visits))
    lst.append('mean_vision'), lst.append('std_vision')
    lst.append('target_va')
    lst.remove('int_1')
    return lst

def reshape_pdf(pdf, n_visits):
    nums, columns = [], column_builder(n_visits)
    pdf.fillna(0, inplace=True)
    for i in range(n_visits): 
        nums.append(pdf.visual_acuity.iloc[i])
        if i != 0: nums.append((pdf.admission_date.iloc[i] - pdf.admission_date.iloc[i-1]).days)
    if n_visits > 6: nums.append(np.mean(pdf.visual_acuity))
    else: nums.append(np.mean(pdf.visual_acuity.iloc[:n_visits+1]))
    if n_visits > 3: nums.append(np.std(pdf.visual_acuity))
    else: nums.append(np.std(pdf.visual_acuity.iloc[:n_visits+1]))
    #mode = statistics.mode(pdf.actual_drug_today)
    #if type(mode) == str: nums.append(mode)
    #else: nums.append("None")
    #nums.append(pdf.age.iloc[0])
    #nums.append(encode_gender(pdf.gender.iloc[0]))
    nums.append(pdf.visual_acuity.iloc[-1])
    return pd.DataFrame(data=[nums], columns=columns)

def encode_gender(g):
    return 0 if g == "Male" else 1

def reshape_df(df, n_visits):
    eyes = df.eye_id.unique()
    frames = []
    for eye in eyes:
        pdf = df[df.eye_id == eye]
        try: frames.append(reshape_pdf(pdf, n_visits))
        except: pass
    return pd.concat(frames)

def save_df_patients(n_years, n_visits=4, test=False):
    if test:
        df = pd.read_csv("~/Documents/github/paper/input/raw_test_data_cleaned.csv")
        df.drop(columns=['actual_drug_today', 'next_interval_in_weeks', 'InjNext',
                 'laterality'], inplace=True)
        df["irf"] = 0
        df["srf"] = 0
    else: 
        df = pd.read_csv("~/Documents/github/paper/input/raw_test_data_cleaned.csv")
    df = patient_cutoff(df, n_years, 4)
    df = cut_time(df, n_years)
    df = reshape_df(df, n_visits)
    #df = pd.get_dummies(df, columns=["mode_drug"])
    #df.drop(columns=['admission_date'], inplace=True)
    #if test: df.to_csv(f"test_{n_years}_years.csv", index=False)
    #else: df.to_csv(f"df_{n_years}_years_{n_visits}_visits.csv", index=False)
    if test: return df

In [68]:
test_df = save_df_patients(3, test=True)
test_df.reset_index(drop=True, inplace=True)
test_df.head()

Unnamed: 0,va_1,va_2,int_2,va_3,int_3,va_4,int_4,mean_vision,std_vision,target_va
0,76.0,76.0,0,70.0,20,70.0,32,72.4,18.865636,20.0
1,76.0,80.0,13,80.0,0,85.0,0,81.2,3.088828,76.0
2,85.0,94.0,24,89.0,17,89.0,0,89.2,3.802923,94.0
3,80.0,85.0,24,85.0,17,61.0,0,80.0,5.119136,94.0
4,80.0,76.0,0,80.0,22,80.0,2,79.2,3.593535,76.0


## Train model on training data

In [78]:
train_df = pd.read_csv(config.TRAINING_FILE[-1])
features = ['va_1', 'va_2', 'int_2', 'va_3', 'int_3', 'va_4', 'int_4', 
            'mean_vision', 'std_vision', 'target_va']
train_df = train_df[features]
train_df.head()

Unnamed: 0,va_1,va_2,int_2,va_3,int_3,va_4,int_4,mean_vision,std_vision,target_va
0,89.0,85.0,24,85.0,27,94.0,22,88.25,3.62498,94.0
1,80.0,76.0,24,50.0,27,46.0,22,63.0,8.558216,70.0
2,55.0,70.0,29,76.0,28,70.0,28,67.75,4.446186,75.0
3,80.0,85.0,36,85.0,42,89.0,56,84.75,3.180237,80.0
4,76.0,65.0,36,65.0,42,65.0,56,67.75,12.222955,61.0


In [79]:
from sklearn import ensemble
from sklearn import linear_model
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

tabnet_params = {"optimizer_fn":torch.optim.Adam,
                 "verbose":0,
                 "optimizer_params":dict(lr=2e-2),
                 "scheduler_params":{"step_size":50, # how to use learning rate scheduler
                                 "gamma":0.9},
                 "scheduler_fn":torch.optim.lr_scheduler.StepLR,
                 "mask_type":'entmax' # "sparsemax"
                }

In [82]:
import joblib
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import config
import os
import argparse
import model_dispatcher

clf = TabNetRegressor(**tabnet_params)

def score(model, X, y, cv=5, scoring='neg_mean_squared_error'):
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return np.mean(scores), np.std(scores)

def run(df, clf):
    # create inputs and targets
    X, y = df.drop(columns=['target_va']).values, df.target_va.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
    # score the model
    kfold_tabnet(clf, X, y.reshape(-1, 1))

def kfold_tabnet(clf, X, y):
    rmses = []
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        rmse = fit_tabnet(clf, X_train, y_train, X_test, y_test)
        rmses.append(rmse)
    final_rmse, rmse_std = np.round(np.mean(rmses), 2), np.round(np.std(rmses), 2)
    print(f"RMSE: mean={final_rmse}, std={rmse_std}")
    
def fit_tabnet(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)],
            eval_metric=['rmse'], patience=1000, max_epochs=10000)
    preds = clf.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

In [83]:
run(train_df, clf)


Early stopping occurred at epoch 1760 with best_epoch = 760 and best_val_0_rmse = 14.85445
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 1018 with best_epoch = 18 and best_val_0_rmse = 8.86549
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 1000 with best_epoch = 0 and best_val_0_rmse = 9.35031
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 1000 with best_epoch = 0 and best_val_0_rmse = 9.87928
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 1003 with best_epoch = 3 and best_val_0_rmse = 7.26034
Best weights from best epoch are automatically used!
RMSE: mean=10.04, std=2.56


## Evaluating on test set

In [84]:
clf

TabNetRegressor(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=0, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=<class 'torch.optim.lr_scheduler.StepLR'>, scheduler_params={'step_size': 50, 'gamma': 0.9}, mask_type='entmax', input_dim=9, output_dim=1, device_name='auto')

In [None]:
def evaluate(df_test, clf):
    # create inputs and targets
    X, y = df.drop(columns=['target_va']).values, df.target_va.values
    