In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from scipy.integrate import quad

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the complete clinic export. low_memory=False has pandas parse the file
# in a single pass rather than in chunks.
df = pd.read_csv(
    filepath_or_buffer='/home/jupyter/charliemacuject/research_papers/data/master_macuject.csv',
    low_memory=False,
)

In [3]:
# Largest ExamLeftGraph value in the raw export.
# Series.max() skips missing values; the builtin max() compares row by row and
# returns NaN whenever the first row happens to be NaN.
# float() keeps the displayed result a plain Python float.
float(df['ExamLeftGraph'].max())

1.5

In [4]:
# Preview the first five rows of the raw export.
df.head(n=5)

Unnamed: 0,Patient::PatientSurname,Patient::PatientFirstname,Patient::PatientBCID,Patient::PatientGender,Patient::PatientDOB,Current age in 2020,CreatedDate,JobClinic,DoctorName,Patient::PatientHistorySum,...,InjectionRightType,InjectionLeftType,InjectionRightTypeNextTime,InjectionNextTimeRight,InjectionLeftTypeNextTime,InjectionNextTimeLeft,InjectionAdministeredRight,InjectionAdministeredLeft,BC_RightInjEF,BC_LEFTInjEF
0,,,4221,Female,,100,24/04/2007,,Dr. Devinder Chauhan,,...,,,,,,,,,,
1,,,4221,Female,,100,04/05/2007,,Dr. Devinder Chauhan,,...,,Lucentis,,,Lucentis,,,,,
2,,,4221,Female,,100,01/06/2007,,Dr. Devinder Chauhan,,...,,Lucentis,,,Lucentis,,,,,
3,,,4221,Female,,100,20/07/2007,,Dr. Devinder Chauhan,,...,,Lucentis,,,Lucentis,,,,,
4,,,4221,Female,,100,17/08/2007,,Dr. Devinder Chauhan,,...,,,,,,,,,,


In [5]:
def drop_less(tdf):
    """Keep a single eye's columns and give them eye-neutral names.

    The eye whose "next injection" column has fewer missing values is kept.
    The right eye is kept only when the left column has strictly more missing
    values; on a tie the left eye is kept.

    Parameters
    ----------
    tdf : pandas.DataFrame
        Must contain ``Exam{Left,Right}Graph``, ``Injection{Left,Right}Type``
        and ``InjectionNextTime{Left,Right}``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the discarded eye's three columns, with the kept
        eye's columns renamed to ``ExamGraph``, ``InjectionType`` and
        ``NextTime``, and with ``ExamGraph`` capped at 1.0.
    """
    left_cols = ['ExamLeftGraph', 'InjectionLeftType', 'InjectionNextTimeLeft']
    right_cols = ['ExamRightGraph', 'InjectionRightType', 'InjectionNextTimeRight']
    neutral_names = ['ExamGraph', 'InjectionType', 'NextTime']

    missing_left = tdf["InjectionNextTimeLeft"].isna().sum()
    missing_right = tdf["InjectionNextTimeRight"].isna().sum()

    if missing_left > missing_right:
        kept, discarded = right_cols, left_cols
    else:
        # Covers both "right has more gaps" and the tie.
        kept, discarded = left_cols, right_cols

    tdf = tdf.drop(columns=discarded).rename(columns=dict(zip(kept, neutral_names)))

    # Values above 1.0 are clamped to 1.0; NaN compares False and is left alone.
    above_cap = tdf['ExamGraph'] > 1.0
    tdf['ExamGraph'] = np.where(above_cap, 1.0, tdf['ExamGraph'])
    return tdf

# function to get days since first and append it as a column
def days_since_first(df):
    """Add ``DaysFirst``: whole days elapsed since the frame's first row.

    ``CreatedDate`` is parsed in place as day-first (dd/mm/yyyy) dates. The
    reference date is the first row as stored, not the earliest date, so the
    frame should already be in visit order if non-negative values are wanted.
    If the first row's date is missing, every value is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CreatedDate``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``CreatedDate`` as datetimes and a new
        ``DaysFirst`` column. An empty frame gets an empty column instead of
        raising IndexError.
    """
    df['CreatedDate'] = pd.to_datetime(df['CreatedDate'], dayfirst=True)
    if len(df) == 0:
        # No first visit to measure from.
        df['DaysFirst'] = []
        return df
    first_visit = df['CreatedDate'].iloc[0]
    # to_numpy() assigns by position, as the original list assignment did.
    df['DaysFirst'] = (df['CreatedDate'] - first_visit).dt.days.to_numpy()
    return df

# function to get actual time between injections
def days_between(df):
    """Add ``actual_time``: weeks elapsed since the previous visit.

    ``CreatedDate`` is parsed in place as day-first dates and the frame is
    sorted by it in place. Each row then receives the gap to the preceding row
    in weeks (whole days / 7, rounded to 2 decimals).

    The first row has no predecessor and gets a real ``NaN`` rather than the
    string ``"NaN"``, so the column stays numeric and ``dropna``/``isna`` see
    the gap as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CreatedDate``. Modified (and re-ordered) in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``actual_time``. Empty frames
        are supported.
    """
    df['CreatedDate'] = pd.to_datetime(df['CreatedDate'], dayfirst=True)
    df.sort_values(by=['CreatedDate'], inplace=True)
    gap_in_days = df['CreatedDate'].diff().dt.days
    # to_numpy() assigns by position, as the original list assignment did.
    df['actual_time'] = (gap_in_days / 7).round(2).to_numpy()
    return df

# function to get the adherence factor for any specific visit
def adherence_factor(df):
    """Add ``adherence_factor``: actual interval divided by the recommended one.

    For every row after the first,
    ``adherence_factor[i] = actual_time[i] / NextTime[i - 1]`` rounded to
    2 decimals, i.e. the gap that actually elapsed before this visit relative
    to the interval recorded on the previous row.

    The first row, and any row where either operand is missing or not
    numeric, gets a real ``NaN`` (previously the string ``"NaN"`` for the
    first row). A previous ``NextTime`` of 0 yields ``inf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NextTime`` and ``actual_time``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``adherence_factor``. Empty
        frames are supported.
    """
    # Coercion also turns a legacy "NaN" string sentinel into a real NaN.
    actual_weeks = pd.to_numeric(df['actual_time'], errors='coerce')
    previous_recommendation = pd.to_numeric(df['NextTime'], errors='coerce').shift(1)
    ratio = actual_weeks / previous_recommendation
    # to_numpy() assigns by position, as the original list assignment did.
    df['adherence_factor'] = ratio.round(2).to_numpy()
    return df

# function to find the running mean adherence over all previous appointments
def running_adherence(df):
    """Add ``running_ad``: mean adherence factor over all earlier rows.

    Row ``i`` receives the mean of the non-missing ``adherence_factor`` values
    in rows ``0 .. i-1`` (the current row is excluded), rounded to 2 decimals.
    Rows with no earlier valid value get a real ``NaN`` instead of the string
    ``"NaN"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``adherence_factor``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``running_ad``.
    """
    # Coercion also turns a legacy "NaN" string sentinel into a real NaN.
    factors = pd.to_numeric(df['adherence_factor'], errors='coerce')
    # The expanding mean includes the current row; shift(1) makes it "prior only".
    prior_mean = factors.expanding(min_periods=1).mean().shift(1)
    df['running_ad'] = prior_mean.round(2).to_numpy()
    return df

# function to find the running standard deviation over all previous appointments
def std_dev(df):
    """Add ``st_dev``: standard deviation of adherence over all earlier rows.

    Row ``i`` receives the population standard deviation (``ddof=0``, the same
    convention as ``np.std``) of the non-missing ``adherence_factor`` values
    in rows ``0 .. i-1``, rounded to 4 decimals. Rows with no earlier valid
    value get ``NaN``; a single earlier value gives 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``adherence_factor``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``st_dev``.
    """
    # Coercion also turns a legacy "NaN" string sentinel into a real NaN.
    factors = pd.to_numeric(df['adherence_factor'], errors='coerce')
    # The expanding window includes the current row; shift(1) makes it "prior only".
    prior_std = factors.expanding(min_periods=1).std(ddof=0).shift(1)
    df['st_dev'] = prior_std.round(4).to_numpy()
    return df

# function to get average adherence over last five visits
def ad_five(df):
    """Add ``ad_five``: mean adherence factor over the previous five rows.

    Row ``i`` receives the mean of the non-missing ``adherence_factor`` values
    in rows ``max(0, i-5) .. i-1``, rounded to 2 decimals. Rows with no valid
    value in that window get a real ``NaN`` instead of the string ``"NaN"``.

    This also fixes an off-by-one in the previous implementation: at ``i == 4``
    it sliced ``iloc[-1:4]``, which wraps round to the end of the frame and is
    always empty, so the fifth row never received a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``adherence_factor``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``ad_five``.
    """
    # Coercion also turns a legacy "NaN" string sentinel into a real NaN.
    factors = pd.to_numeric(df['adherence_factor'], errors='coerce')
    # Shift first so the five-row window ends on the previous row.
    recent_mean = factors.shift(1).rolling(window=5, min_periods=1).mean()
    df['ad_five'] = recent_mean.round(2).to_numpy()
    return df

# function to get previous vision score
def prev_vision(df):
    """Add ``prev_vision``: the ``ExamGraph`` value from the preceding row.

    The first row has no predecessor and repeats its own ``ExamGraph`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new column ``prev_vision``.
    """
    scores = df['ExamGraph'].tolist()
    # Own value for row 0, then every score moved down one position.
    df['prev_vision'] = scores[:1] + scores[:-1]
    return df

# function to get average vision score
def mean_vision(df):
    """Add ``mean_vision``: mean ``ExamGraph`` over all earlier rows.

    Row ``i`` receives the mean of the non-missing ``ExamGraph`` values in
    rows ``0 .. i-1`` (the current row is excluded), rounded to 2 decimals.
    Rows with no earlier valid value get a real ``NaN`` instead of the string
    ``"NaN"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``mean_vision``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # The expanding mean includes the current row; shift(1) makes it "prior only".
    prior_mean = scores.expanding(min_periods=1).mean().shift(1)
    df['mean_vision'] = prior_mean.round(2).to_numpy()
    return df

# function to get average vision from last five visits
def last_five(df):
    """Add ``last_five``: mean ``ExamGraph`` over the previous five rows.

    Row ``i`` receives the mean of the non-missing ``ExamGraph`` values in
    rows ``max(0, i-5) .. i-1``, rounded to 2 decimals. Rows with no valid
    value in that window get a real ``NaN`` instead of the string ``"NaN"``.

    This also fixes an off-by-one in the previous implementation: at ``i == 4``
    it sliced ``iloc[-1:4]``, which wraps round to the end of the frame and is
    always empty, so the fifth row never received a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``last_five``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # Shift first so the five-row window ends on the previous row.
    recent_mean = scores.shift(1).rolling(window=5, min_periods=1).mean()
    df['last_five'] = recent_mean.round(2).to_numpy()
    return df

# function to get average vision from last 10 visits
def last_ten(df):
    """Add ``last_ten``: mean ``ExamGraph`` over the previous ten rows.

    Row ``i`` receives the mean of the non-missing ``ExamGraph`` values in
    rows ``max(0, i-10) .. i-1``, rounded to 2 decimals. Rows with no valid
    value in that window get a real ``NaN`` instead of the string ``"NaN"``.

    This also fixes an off-by-one in the previous implementation: at ``i == 9``
    it sliced ``iloc[-1:9]``, which wraps round to the end of the frame and is
    always empty, so the tenth row never received a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``last_ten``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # Shift first so the ten-row window ends on the previous row.
    recent_mean = scores.shift(1).rolling(window=10, min_periods=1).mean()
    df['last_ten'] = recent_mean.round(2).to_numpy()
    return df

# function to get average vision from last 3 visits
def last_three(df):
    """Add ``last_three``: mean ``ExamGraph`` over the previous three rows.

    Row ``i`` receives the mean of the non-missing ``ExamGraph`` values in
    rows ``max(0, i-3) .. i-1``, rounded to 2 decimals. Rows with no valid
    value in that window get a real ``NaN`` instead of the string ``"NaN"``.

    This also fixes an off-by-one in the previous implementation: at ``i == 2``
    it sliced ``iloc[-1:2]``, which wraps round to the end of the frame and is
    always empty, so the third row never received a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``last_three``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # Shift first so the three-row window ends on the previous row.
    recent_mean = scores.shift(1).rolling(window=3, min_periods=1).mean()
    df['last_three'] = recent_mean.round(2).to_numpy()
    return df

# function to find the running standard deviation of vision (ExamGraph) over all previous appointments
def std_vision(df):
    """Add ``std_vision``: standard deviation of ``ExamGraph`` over earlier rows.

    Row ``i`` receives the population standard deviation (``ddof=0``, the same
    convention as ``np.std``) of the non-missing ``ExamGraph`` values in rows
    ``0 .. i-1``, rounded to 4 decimals. Rows with no earlier valid value get
    ``NaN``; a single earlier value gives 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``std_vision``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # The expanding window includes the current row; shift(1) makes it "prior only".
    prior_std = scores.expanding(min_periods=1).std(ddof=0).shift(1)
    df['std_vision'] = prior_std.round(4).to_numpy()
    return df

# function to get the standard deviation of vision over the last five visits
def std_five(df):
    """Add ``std_five``: standard deviation of ``ExamGraph`` over the previous five rows.

    Row ``i`` receives the population standard deviation (``ddof=0``, the same
    convention as ``np.std``) of the non-missing ``ExamGraph`` values in rows
    ``max(0, i-5) .. i-1``, rounded to 4 decimals. Rows with no valid value in
    that window get ``NaN``.

    This also fixes an off-by-one in the previous implementation: at ``i == 4``
    it sliced ``iloc[-1:4]``, which wraps round to the end of the frame and is
    always empty, so the fifth row never received a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ExamGraph``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new float column ``std_five``.
    """
    scores = pd.to_numeric(df['ExamGraph'], errors='coerce')
    # Shift first so the five-row window ends on the previous row.
    recent_std = scores.shift(1).rolling(window=5, min_periods=1).std(ddof=0)
    df['std_five'] = recent_std.round(4).to_numpy()
    return df

# function to only keep patients whose visits all fall after 1 January 2016
def patient_clean(df, cutoff="2016-01-01"):
    """Return True when every visit in ``df`` is strictly after ``cutoff``.

    ``CreatedDate`` is parsed day-first (dd/mm/yyyy), matching
    ``days_since_first`` and ``days_between``. Without ``dayfirst=True`` a
    value such as "04/05/2016" is read month-first, and newer pandas versions
    raise when a later row such as "24/04/2016" does not fit that format.

    Parameters
    ----------
    df : pandas.DataFrame
        One patient's rows; must contain ``CreatedDate``. Not modified.
    cutoff : str or datetime-like, optional
        Threshold date. Defaults to 1 January 2016.

    Returns
    -------
    bool
        True if all visits are later than ``cutoff``. A visit exactly on the
        cutoff, or with a missing date, makes the result False. An empty frame
        returns True.
    """
    visit_dates = pd.to_datetime(df['CreatedDate'], dayfirst=True)
    threshold = pd.to_datetime(cutoff)
    # NaT compares False, so a missing date disqualifies the patient.
    return bool((visit_dates > threshold).all())

# function to filter dataframe based on cutoff
def cutoff_filter(path='/home/jupyter/charliemacuject/research_papers/data/master_macuject.csv'):
    """Load the export and keep only patients accepted by ``patient_clean``.

    The CSV is read, three columns are renamed (``Patient::PatientBCID`` ->
    ``ID``, ``Patient::PatientGender`` -> ``gender``, ``Current age in 2020``
    -> ``age``) and the rows are grouped by ``ID``. A patient's rows are kept
    only when ``patient_clean`` returns True for them.

    Parameters
    ----------
    path : str, optional
        Location of the CSV export. Defaults to the path this notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        The kept rows, patient by patient in order of first appearance, with
        their original index labels. If no patient qualifies, an empty frame
        with the same columns is returned instead of ``pd.concat`` raising.
    """
    # low_memory=False matches how the notebook loads the same file elsewhere.
    df = pd.read_csv(path, low_memory=False)
    df = df.rename(columns={"Patient::PatientBCID": "ID", "Patient::PatientGender": "gender", "Current age in 2020": "age"})
    # sort=False keeps patients in order of first appearance, and one grouping
    # pass replaces a full-frame scan per patient. Rows with a missing ID form
    # no group; they never matched an ID before either.
    frames = [pdf for _, pdf in df.groupby("ID", sort=False) if patient_clean(pdf)]
    if not frames:
        return df.iloc[0:0]
    master = pd.concat(frames)
    return master

In [6]:
# Restrict the export to the patients accepted by cutoff_filter, then preview it.
master = cutoff_filter()
master.head(n=5)

Unnamed: 0,Patient::PatientSurname,Patient::PatientFirstname,ID,gender,Patient::PatientDOB,age,CreatedDate,JobClinic,DoctorName,Patient::PatientHistorySum,...,InjectionRightType,InjectionLeftType,InjectionRightTypeNextTime,InjectionNextTimeRight,InjectionLeftTypeNextTime,InjectionNextTimeLeft,InjectionAdministeredRight,InjectionAdministeredLeft,BC_RightInjEF,BC_LEFTInjEF
115,,,4222,Male,,91,25/01/2017,Box Hill,Dr. Devinder Chauhan,,...,,,,,,,,,LUCENTIS,
116,,,4222,Male,,91,21/02/2017,Box Hill,Dr. Devinder Chauhan,,...,Lucentis,,Lucentis,4.0,,,Yes,,LUCENTIS,
117,,,4222,Male,,91,28/03/2017,Box Hill,Dr. Devinder Chauhan,,...,Lucentis,,Lucentis,6.0,,,Yes,,LUCENTIS,
118,,,4222,Male,,91,09/05/2017,Box Hill,Dr. Devinder Chauhan,,...,Lucentis,,Lucentis,8.0,,,Yes,,LUCENTIS,
119,,,4222,Male,,91,08/08/2017,Box Hill,Dr. Devinder Chauhan,,...,Lucentis,,Lucentis,12.0,,,Yes,,LUCENTIS,


In [7]:
# Row counts before and after the cutoff filter.
(df.shape[0], master.shape[0])

(56641, 11081)

In [8]:
# Rebuild the filtered cohort so this cell does not depend on an earlier
# binding of `master`.
master = cutoff_filter()

# Distinct patient IDs, in order of first appearance.
id_list = master["ID"].unique()

def dataframe_gen(pat_id, source=None):
    """Build the per-visit feature table for one patient.

    Selects the rows whose ``ID`` equals ``pat_id`` and runs them through, in
    order: ``drop_less``, ``days_since_first``, ``days_between``,
    ``adherence_factor``, ``running_adherence``, ``std_dev``, ``prev_vision``,
    ``mean_vision`` and ``std_vision``.

    Parameters
    ----------
    pat_id
        Value to match against the ``ID`` column.
    source : pandas.DataFrame, optional
        Frame to select the patient from. When omitted, the notebook-level
        ``master`` is used, as before. Passing the frame explicitly avoids
        depending on whatever ``master`` is bound to when the call is made;
        later cells rebind and mutate ``master``.

    Returns
    -------
    pandas.DataFrame
        The patient's rows with the derived columns added.
    """
    visits = master if source is None else source
    pdf = visits[visits["ID"] == pat_id]
    # NOTE(review): days_since_first runs before days_between sorts the rows,
    # so DaysFirst is measured from the first row as stored. Confirm the export
    # is already in visit order per patient.
    pipeline = (
        drop_less,
        days_since_first,
        days_between,
        adherence_factor,
        running_adherence,
        std_dev,
        prev_vision,
        mean_vision,
        std_vision,
    )
    for step in pipeline:
        pdf = step(pdf)
    return pdf

In [9]:
# Smoke-test the pipeline on the first two patients and stack the results.
df1, df2 = dataframe_gen(id_list[0]), dataframe_gen(id_list[1])
frames = [df1, df2]
result = pd.concat(frames)
result

Unnamed: 0,Patient::PatientSurname,Patient::PatientFirstname,ID,gender,Patient::PatientDOB,age,CreatedDate,JobClinic,DoctorName,Patient::PatientHistorySum,...,BC_RightInjEF,BC_LEFTInjEF,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,prev_vision,mean_vision,std_vision
115,,,4222,Male,,91,2017-01-25,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,0,,,,,0.166667,,
116,,,4222,Male,,91,2017-02-21,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,27,3.86,,,,0.166667,0.17,0.0
117,,,4222,Male,,91,2017-03-28,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,62,5.0,1.25,,,0.4,0.28,0.1167
118,,,4222,Male,,91,2017-05-09,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,104,6.0,1.0,1.25,0.0,0.333333,0.3,0.0981
119,,,4222,Male,,91,2017-08-08,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,195,13.0,1.62,1.12,0.125,0.333333,0.31,0.0862
120,,,4222,Male,,91,2017-10-31,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,279,12.0,1.0,1.29,0.2547,0.333333,0.31,0.0777
121,,,4222,Male,,91,2018-01-23,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,363,12.0,1.0,1.22,0.2538,0.333333,0.32,0.0714
122,,,4222,Male,,91,2018-04-24,Box Hill,Dr. Devinder Chauhan,,...,LUCENTIS,,454,13.0,1.08,1.17,0.2431,0.333333,0.32,0.0663
123,,,4222,Male,,91,2018-07-24,Box Hill,Dr. Devinder Chauhan (BOR),,...,LUCENTIS,,545,13.0,1.08,1.16,0.2247,0.333333,0.32,0.0622
124,,,4222,Male,,91,2018-11-13,Box Hill,Dr. Devinder Chauhan (BOR),,...,LUCENTIS,,657,16.0,1.33,1.15,0.2098,0.25,0.31,0.0627


In [10]:
def master_dataframe():
    """Run ``dataframe_gen`` for every filtered patient and stack the results.

    Patient IDs come from a fresh ``cutoff_filter()`` call. Each patient's
    original ``ID`` column is replaced by the patient's position in that ID
    list. A skipped patient leaves a gap in the numbering.

    A patient whose processing raises is skipped, as before, but the skip is
    now reported instead of happening silently. Only ``Exception`` is caught,
    so KeyboardInterrupt and SystemExit still propagate.

    Note that ``dataframe_gen`` is called with the patient ID only, so it
    selects rows from the notebook-level ``master``, not from the frame loaded
    here. ``master`` must still hold the cutoff-filtered data when this runs.

    Returns
    -------
    pandas.DataFrame
        All successfully processed patients, concatenated.

    Raises
    ------
    ValueError
        From ``pd.concat`` when no patient could be processed.
    """
    df = cutoff_filter()
    # get patient IDs
    id_list = df["ID"].unique()
    frames = []
    skipped = []
    for i, pat_id in enumerate(id_list):
        try:
            pdf = dataframe_gen(pat_id)
            pdf = pdf.drop(columns=['ID'])
            pdf['ID'] = i
            frames.append(pdf)
        except Exception as exc:
            skipped.append((pat_id, exc))
    if skipped:
        # Printed rather than warned: this notebook filters all warnings.
        first_id, first_exc = skipped[0]
        print(
            f"master_dataframe: skipped {len(skipped)} of {len(id_list)} patients; "
            f"first failure was ID {first_id!r}: {first_exc!r}"
        )
    master = pd.concat(frames)
    return master

# Build the per-visit feature table for every patient and report its row count.
# This rebinds `master` from the filtered raw export to the derived table.
master = master_dataframe()
master.shape[0]

11081

In [11]:
# List the columns of the derived table before the unused ones are dropped.
master.columns

Index(['Patient::PatientSurname', 'Patient::PatientFirstname', 'gender',
       'Patient::PatientDOB', 'age', 'CreatedDate', 'JobClinic', 'DoctorName',
       'Patient::PatientHistorySum', 'Patient::DiagRightCalc',
       'Patient::DiagLeftCalc', 'InjectionPhaseRight', 'InjectionPhaseLeft',
       'InjectionRightSymptom', 'InjectionLeftSymptom', 'ExamRightUaCalc',
       'ExamRightGlCalc', 'ExamRightPhCalc', 'ExamRightIop', 'ExamGraph',
       'ExamLeftUaCalc', 'ExamLeftGlCalc', 'ExamLeftPhCalc', 'ExamLeftIop',
       'InjectionRightFindingsCalc', 'InjectionLeftFindingsCalc',
       'InjectionRightMacFindingHaem', 'InjectionLeftMacFindingHaem',
       'InjectionType', 'InjectionRightTypeNextTime', 'NextTime',
       'InjectionLeftTypeNextTime', 'InjectionAdministeredRight',
       'InjectionAdministeredLeft', 'BC_RightInjEF', 'BC_LEFTInjEF',
       'DaysFirst', 'actual_time', 'adherence_factor', 'running_ad', 'st_dev',
       'prev_vision', 'mean_vision', 'std_vision', 'ID'],
      dtype='object')

In [12]:
# Remove the columns that are not carried into the final table, including the
# patient name and date-of-birth fields. The frame is rebound rather than
# modified in place.
master = master.drop(
    columns=[
        'Patient::PatientSurname',
        'Patient::PatientFirstname',
        'Patient::PatientDOB',
        'JobClinic',
        'DoctorName',
        'Patient::PatientHistorySum',
        'Patient::DiagRightCalc',
        'Patient::DiagLeftCalc',
        'InjectionPhaseRight',
        'InjectionPhaseLeft',
        'InjectionRightSymptom',
        'InjectionLeftSymptom',
        'ExamRightUaCalc',
        'ExamRightGlCalc',
        'ExamRightPhCalc',
        'ExamRightIop',
        'ExamLeftUaCalc',
        'ExamLeftGlCalc',
        'ExamLeftPhCalc',
        'ExamLeftIop',
        'InjectionRightFindingsCalc',
        'InjectionLeftFindingsCalc',
        'InjectionRightMacFindingHaem',
        'InjectionLeftMacFindingHaem',
        'InjectionRightTypeNextTime',
        'InjectionLeftTypeNextTime',
        'InjectionAdministeredRight',
        'InjectionAdministeredLeft',
        'BC_RightInjEF',
        'BC_LEFTInjEF',
    ]
)

In [13]:
# Keep only rows in which every remaining column has a value.
master = master.dropna(axis=0, how='any')

In [14]:
# Display the cleaned table; the rendered output elides the middle rows.
master

Unnamed: 0,gender,age,CreatedDate,ExamGraph,InjectionType,NextTime,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,prev_vision,mean_vision,std_vision,ID
118,Male,91,2017-05-09,0.333333,Lucentis,8.0,104,6.0,1.0,1.25,0.0000,0.333333,0.3,0.0981,0
119,Male,91,2017-08-08,0.333333,Lucentis,12.0,195,13.0,1.62,1.12,0.1250,0.333333,0.31,0.0862,0
120,Male,91,2017-10-31,0.333333,Lucentis,12.0,279,12.0,1.0,1.29,0.2547,0.333333,0.31,0.0777,0
121,Male,91,2018-01-23,0.333333,Lucentis,12.0,363,12.0,1.0,1.22,0.2538,0.333333,0.32,0.0714,0
122,Male,91,2018-04-24,0.333333,Lucentis,12.0,454,13.0,1.08,1.17,0.2431,0.333333,0.32,0.0663,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56399,Female,96,2017-06-13,0.200000,Eylea,8.0,102,6.0,1.0,1.07,0.1800,0.200000,0.19,0.0157,506
56484,Female,57,2020-01-07,0.333333,Lucentis,4.0,203,4.0,1.0,1.0,0.0700,0.333333,0.38,0.0601,507
56485,Female,57,2020-02-13,0.333333,Lucentis,6.0,240,5.29,1.32,1.0,0.0572,0.333333,0.38,0.0583,507
56488,Female,57,2020-06-16,0.500000,Lucentis,8.0,364,6.0,1.0,1.05,0.1415,0.500000,0.4,0.0722,507


In [15]:
# Check the cap applied in drop_less: no ExamGraph value should exceed 1.0.
# Series.max() skips missing values, whereas the builtin max() is sensitive to
# where NaNs sit. float() keeps the displayed result a plain Python float.
float(master['ExamGraph'].max())

1.0

In [16]:
# Persist the cleaned feature table; the index is written as the first column.
master.to_csv(
    path_or_buf='/home/jupyter/charliemacuject/research_papers/data/master_allstats.csv',
)