In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta  
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the raw visit export and immediately discard every column that the
# rest of the notebook never reads, leaving only the identifiers, fluid
# flags/measurements, visit date, drug, visual acuity and laterality.
df = pd.read_csv(
    "~/Documents/github/paper/input/data.csv", low_memory=False
).drop(
    columns=[
        'next_visit_date',
        'is_submacular_blood_present',
        'created_at',
        'updated_at',
        'is_floaters_present',
        'is_irf_gt_initiation',
        'is_srf_gt_initiation',
        'is_ped_gt_last_visit',
        'is_ped_gt_initiation',
        'is_submacular_blood_new',
        'is_vision_worse_than_last_visit',
        'drug_id',
        'next_drug_id',
        'next_recommendation_eye_state',
        'message',
        'has_submacular_blood_resolved',
        'should_extend',
        'doctor_visit_type',
        'doctor_visits_id',
        'irf_srf_ped_lt_previous_drug',
        'resume_treatment',
        'is_atypical',
        'status_from_silverpond',
        'silverpond_original_image_url',
        'silverpond_overlay_image_url',
        'is_oct_correct',
        'is_irf_gt_initiation_prediction',
        'is_srf_gt_initiation_prediction',
        'is_recommendation_changed',
        'recommended_drug_today',
        'recommended_drug_Next_time',
        'recommended_Next_interval',
        'actual_drug_Next_time',
        'actual_Next_interval',
        'clinic_id',
        'user_id',
        'creation_mode',
        'high_res_silverpond_original_image_url',
        'high_res_silverpond_overlay_image_url',
        'clinic_visit_type',
        'planned_drug_id',
        'planned_interval_in_weeks',
        'completed',
        'treatment_reason',
        'reason_bilat',
        'reason_pt',
        'reason_fluid',
        'reason_only_eye',
        'oct_attached_at',
        'fluid_measured_at',
        'patched',
        'oct_inference_job_id',
        'irf_focal',
        'irf_diffuse',
        'reason_va',
        'bscan_original_image_url',
        'bscan_overlay_image_url',
        'fundus_original_image_url',
        'fundus_overlay_image_url',
    ]
)
df.head()

Unnamed: 0,id,eye_id,is_irf_present,is_srf_present,next_interval_in_weeks,admission_date,actual_drug_today,visual_acuity,irf,srf,ur,laterality
0,38431,1829,,,,23/6/20,nil,85.0,,,18363,Left
1,38432,1830,,,12.0,23/6/20,Eylea,70.0,,,18363,Right
2,38435,1787,,,6.0,23/6/20,Eylea,40.0,,,ers21736,Left
3,72546,11936,False,True,4.0,20/1/21,Lucentis,60.0,0.0,119.933,ers27377,Left
4,105265,1985,False,True,5.0,11/1/22,Lucentis,76.0,0.0,119.799,ers22077,Left


In [5]:
# Visits where intraretinal fluid was explicitly recorded as absent
# (rows with a missing flag are not selected).
df.loc[df["is_irf_present"].eq(False)]

Unnamed: 0,id,eye_id,is_irf_present,is_srf_present,next_interval_in_weeks,admission_date,actual_drug_today,visual_acuity,irf,srf,ur,laterality
3,72546,11936,False,True,4.0,20/1/21,Lucentis,60.0,0.000,119.933,ers27377,Left
4,105265,1985,False,True,5.0,11/1/22,Lucentis,76.0,0.000,119.799,ers22077,Left
17,50217,1832,False,False,9.0,25/8/20,Eylea,70.0,339.421,0.000,ERS12592,Right
26,50221,630,False,True,10.0,25/8/20,Lucentis,75.0,0.000,4.544,ers14916,Right
27,58512,2945,False,True,4.0,17/9/20,Lucentis,55.0,,,20168,Left
...,...,...,...,...,...,...,...,...,...,...,...,...
31375,94370,1811,False,True,4.0,12/10/21,Eylea,70.0,0.000,9.128,ERS17950,Left
31376,94367,2046,False,True,4.0,12/10/21,Eylea,76.0,0.000,148.408,ers25323,Right
31377,94372,2119,False,False,4.0,12/10/21,Eylea,61.0,0.000,87.730,ers24475,Left
31379,94373,1862,False,False,4.0,12/10/21,Lucentis,76.0,0.000,0.000,ERS16656,Right


In [6]:
# Number of distinct raw eye identifiers (a missing value would count as one).
df["eye_id"].nunique(dropna=False)

910

In [7]:
# Number of distinct `ur` values (a missing value would count as one).
df["ur"].nunique(dropna=False)

456

In [8]:
def id_cleaner(df):
    """Re-label eyes with consecutive integer ids, one id per (eye_id, laterality).

    Eyes are visited in order of first appearance of their raw ``eye_id``;
    within each raw id the 'Left' rows are emitted before the 'Right' rows.
    Rows whose ``laterality`` is neither 'Left' nor 'Right' are dropped, as
    they were in the original implementation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``eye_id`` and ``laterality``.

    Returns
    -------
    pandas.DataFrame
        The kept rows (original index preserved) with ``eye_id`` overwritten
        by the new id. The input frame is not modified. An empty frame with
        the same columns is returned when no row qualifies.
    """
    frames = []
    next_id = 0
    for eye in df.eye_id.unique():
        pdf = df[df.eye_id == eye]
        for side in ('Left', 'Right'):
            side_rows = pdf[pdf.laterality == side]
            if len(side_rows) == 0:
                continue
            # The id is taken from the counter at the moment the group is
            # emitted. Pre-assigning `i + 1` to the right eye (as before) gave
            # a Right-only eye an id that the next eye then reused, silently
            # merging two different eyes.
            frames.append(side_rows.assign(eye_id=next_id))
            next_id += 1
    if not frames:
        # pd.concat([]) raises; an empty result is the natural answer here.
        return df.iloc[0:0].copy()
    return pd.concat(frames)

def time_sort(df):
    """Parse ``admission_date`` (day-first) and sort each eye's visits by date.

    Eyes keep the order in which their ``eye_id`` first appears in ``df``;
    only the rows inside each eye are reordered. Dates are parsed per eye,
    as before, so parsing behaviour is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``eye_id`` and ``admission_date``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``admission_date`` column holds parsed datetimes.
        The input frame is not modified.
    """
    frames = []
    for eye in df.eye_id.unique():
        pdf = df[df.eye_id == eye]
        # .assign builds a new frame instead of writing into a filtered slice
        # (which triggers SettingWithCopyWarning / relies on an implicit copy).
        pdf = pdf.assign(
            admission_date=pd.to_datetime(pdf.admission_date, dayfirst=True)
        )
        # A stable sort keeps same-day visits in their original relative
        # order, so the output is deterministic.
        frames.append(pdf.sort_values(by='admission_date', kind='stable'))
    if not frames:
        # No rows at all: still return the parsed-date schema rather than
        # failing inside pd.concat.
        return df.assign(
            admission_date=pd.to_datetime(df.admission_date, dayfirst=True)
        )
    return pd.concat(frames)

In [9]:
# Re-id the eyes, order each eye's visits chronologically, then trim the frame
# down to the columns used downstream (`eye_id` becomes `id`).
#
# The 'nil' placeholder is mapped to NaN with an explicit replacement value.
# The previous `replace(['nil', np.nan])` passed no `value`: on older pandas
# that forward-fills every 'nil'/NaN cell from the row above (leaking values
# across eye boundaries), and on pandas >= 3 it raises. The follow-up
# column-level `inplace` replace was a chained assignment that is a no-op
# under copy-on-write, so both are folded into one frame-level call.
cleaned_df = (
    time_sort(id_cleaner(df))
    .reset_index(drop=True)
    .drop(columns=['next_interval_in_weeks', 'ur', 'laterality', 'id'])
    .rename(columns={"eye_id": "id"})
    .replace('nil', np.nan)
)
cleaned_df.head()

Unnamed: 0,id,is_irf_present,is_srf_present,admission_date,actual_drug_today,visual_acuity,irf,srf
0,0,,,2014-03-04,,89.0,,
1,0,,,2014-03-28,,85.0,,
2,0,,,2014-04-24,,85.0,,
3,0,,,2014-05-16,,94.0,,
4,0,,,2014-06-06,,94.0,,


In [10]:
# Persist the cleaned per-visit table (row index omitted) for the aggregation stage.
cleaned_df.to_csv(path_or_buf="raw_data_cleaned.csv", index=False)

# From raw visits to aggregated per-eye features

In [11]:
# Reload the cleaned visits from disk; the drug column is not used in the
# aggregation below, so it is dropped straight away.
df = pd.read_csv("raw_data_cleaned.csv").drop(columns=['actual_drug_today'])
df.head()

Unnamed: 0,id,is_irf_present,is_srf_present,admission_date,visual_acuity,irf,srf
0,0,,,2014-03-04,89.0,,
1,0,,,2014-03-28,85.0,,
2,0,,,2014-04-24,85.0,,
3,0,,,2014-05-16,94.0,,
4,0,,,2014-06-06,94.0,,


In [12]:
def patient_cutoff(df, cutoff_year, cutoff_visits):
    """Keep only eyes with enough follow-up time and enough visits.

    An eye is kept when the span between its earliest and latest
    ``admission_date`` is at least ``cutoff_year`` years (a year is counted
    as 365 days) and it has at least ``cutoff_visits`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and ``admission_date``.
    cutoff_year : int or float
        Minimum follow-up span in years.
    cutoff_visits : int
        Minimum number of visits (rows) per eye.

    Returns
    -------
    pandas.DataFrame
        The rows of the qualifying eyes, in order of first appearance of each
        ``id``. An empty frame with the same columns is returned when no eye
        qualifies.
    """
    frames = []
    for eye in df.id.unique():
        pdf = df[df.id == eye]
        # An empty group (e.g. a NaN id, which never compares equal) or one
        # with too few visits cannot qualify; skip before touching dates.
        if len(pdf) == 0 or len(pdf) < cutoff_visits:
            continue
        dates = pd.to_datetime(pdf.admission_date)
        # min/max instead of first/last row, so the span does not silently
        # depend on the rows already being sorted by date.
        followup_years = (dates.max() - dates.min()).days / 365
        if followup_years >= cutoff_year:
            frames.append(pdf)
    if not frames:
        # pd.concat([]) raises "No objects to concatenate".
        return df.iloc[0:0].copy()
    return pd.concat(frames)

def cut_time(df, cutoff_time):
    """Truncate each eye's visits to the first ``cutoff_time`` years.

    For every eye the earliest ``admission_date`` is treated as initiation;
    rows dated later than initiation + ``cutoff_time`` * 365 days are removed
    (the cutoff day itself is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and ``admission_date``.
    cutoff_time : int or float
        Length of the window in years (a year is counted as 365 days).

    Returns
    -------
    pandas.DataFrame
        The retained rows with ``admission_date`` parsed to datetimes. The
        input frame is not modified.
    """
    frames = []
    for eye in df.id.unique():
        pdf = df[df.id == eye]
        if pdf.empty:
            # e.g. a NaN id, which never compares equal to itself
            continue
        # .assign builds a new frame rather than writing into a filtered slice.
        pdf = pdf.assign(admission_date=pd.to_datetime(pdf.admission_date))
        # Earliest date rather than first row, so the window does not depend
        # on the rows already being in chronological order.
        cutoff = pdf['admission_date'].min() + timedelta(days=cutoff_time * 365)
        frames.append(pdf[pdf['admission_date'] <= cutoff])
    if not frames:
        # Nothing to concatenate: return an empty frame with the parsed schema.
        return df.assign(admission_date=pd.to_datetime(df.admission_date)).iloc[0:0]
    return pd.concat(frames)

In [13]:
def impute_pdf(df):
    """Fill missing values with each column's most frequent value.

    Every NaN is replaced by the mode of its column (the smallest value wins
    when several values tie). Cells that are still missing afterwards, such
    as those in a column with no observed values at all, are set to 0.

    This is a pandas-only implementation: the previous body called
    ``SimpleImputer``, which is never imported in this notebook, so the
    function raised ``NameError`` when called. Unlike the array round-trip
    it replaces, column dtypes are preserved where possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``. The input
        frame is not modified.
    """
    imputed_df = df.copy()
    modes = df.mode(dropna=True)
    if not modes.empty:
        # Row 0 of .mode() holds the first (smallest) mode per column; columns
        # without any observed value have NaN there and are left to the
        # zero-fill below.
        fill_values = modes.iloc[0].dropna()
        if len(fill_values):
            imputed_df = imputed_df.fillna(fill_values)
    return imputed_df.fillna(0)

def reshape_pdf(pdf):
    """Flatten one eye's visits into a single feature row.

    Missing values are treated as 0. The row contains visual acuity, irf and
    srf for each of the first four visits, followed by the mean and the
    population standard deviation (ddof=0) of visual acuity over all rows of
    ``pdf``, and finally the visual acuity of the last row as ``target_va``.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Visits of a single eye with the columns ``visual_acuity``, ``irf``
        and ``srf``; rows are read in their existing order.

    Returns
    -------
    pandas.DataFrame
        One row with 15 columns. The input frame is not modified.

    Raises
    ------
    IndexError
        If ``pdf`` has fewer than four rows.
    """
    columns = ['first_va', 'irf_1', 'srf_1', 'second_va',
               'irf_2', 'srf_2', 'third_va', 'irf_3', 'srf_3',
               'fourth_va', 'irf_4', 'srf_4', 'mean_vision', 'std_vision',
               'target_va']
    n_baseline = 4
    if len(pdf) < n_baseline:
        # Same exception type the positional lookup below would raise, but
        # with a message that says what is wrong.
        raise IndexError(
            f"reshape_pdf needs at least {n_baseline} visits, got {len(pdf)}"
        )
    # Work on a filled copy: an in-place fillna would overwrite the caller's
    # data (or write into a slice of a larger frame).
    filled = pdf.fillna(0)
    nums = []
    for i in range(n_baseline):
        nums.append(filled.visual_acuity.iloc[i])
        nums.append(filled.irf.iloc[i])
        nums.append(filled.srf.iloc[i])
    nums.append(np.mean(filled.visual_acuity))
    nums.append(np.std(filled.visual_acuity))
    nums.append(filled.visual_acuity.iloc[-1])
    return pd.DataFrame(data=[nums], columns=columns)

def reshape_df(df):
    """Build the per-eye feature table by applying ``reshape_pdf`` to each eye.

    Eyes are processed in order of first appearance of their ``id``. Eyes for
    which ``reshape_pdf`` raises ``IndexError`` (too few visits to fill the
    four per-visit feature slots) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column plus whatever ``reshape_pdf`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per retained eye (each row keeps index 0, as produced by
        ``reshape_pdf``).

    Raises
    ------
    ValueError
        If no eye could be reshaped.
    """
    frames = []
    for eye in df.id.unique():
        pdf = df[df.id == eye]
        try:
            frames.append(reshape_pdf(pdf))
        except IndexError:
            # Only the expected "not enough visits" case is skipped. A bare
            # `except` also hid real problems (missing columns, typos, even
            # KeyboardInterrupt) and turned them into silently missing rows.
            continue
    if not frames:
        # Same exception type pd.concat([]) would raise, with a clearer cause.
        raise ValueError("reshape_df: no eye has enough visits to be reshaped")
    return pd.concat(frames)

In [16]:
def save_df_patients(n_years):
    """Create and save the per-eye feature table for an ``n_years`` window.

    Reads ``raw_data_cleaned.csv``, keeps eyes with at least ``n_years`` of
    follow-up and at least 4 visits, truncates each eye to its first
    ``n_years`` years, reshapes to one row per eye and writes the result to
    ``df_{n_years}_years.csv`` without the row index.
    """
    visits = pd.read_csv("raw_data_cleaned.csv").drop(columns=['actual_drug_today'])
    eligible = patient_cutoff(visits, n_years, 4)
    windowed = cut_time(eligible, n_years).drop(columns=['admission_date'])
    features = reshape_df(windowed)
    features.to_csv(f"df_{n_years}_years.csv", index=False)

In [17]:
# Export one feature table per follow-up window: 1, 2 and 3 years.
for n_years in (1, 2, 3):
    save_df_patients(n_years)