In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings('ignore')

## Get the dataframes

In [2]:
df1 = pd.read_csv('/home/jupyter/charliemacuject/pharma_reports/data/DME/devchau.csv')
df2 = pd.read_csv('/home/jupyter/charliemacuject/pharma_reports/data/DME/devchau_bor.csv')
df3 = pd.read_csv('/home/jupyter/charliemacuject/pharma_reports/data/DME/ericmayer.csv')

In [3]:
df1['location'] = 'box_hill'
df2['location'] = 'boronia'
df3['location'] = 'box_hill'

In [4]:
df1.drop(columns=['Unnamed: 0'], inplace=True)
df1.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,22/02/2017,Left,76.0,,,,box_hill
1,0,09/03/2017,Left,76.0,Avastin,,,box_hill
2,0,22/03/2017,Left,76.0,,,,box_hill
3,0,11/04/2017,Left,76.0,Lucentis,Lucentis,4.0,box_hill
4,0,09/05/2017,Left,76.0,Lucentis,Lucentis,4.0,box_hill


In [5]:
df2.drop(columns=['Unnamed: 0'], inplace=True)
df2.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,09/11/2020,Left,76.0,,,,boronia
1,0,09/11/2020,Left,,,,,boronia
2,0,08/12/2020,Left,76.0,,,,boronia
3,0,05/01/2021,Left,76.0,,,,boronia
4,0,28/01/2021,Left,76.0,,,,boronia


In [6]:
df3.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Unnamed: 7', 'Unnamed: 8'], inplace=True)
df3.head()

Unnamed: 0,Unnamed: 0.1.1,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,0,2016-10-18,Left,89.0,,,,box_hill
1,1,0,2017-01-24,Left,94.0,,,,box_hill
2,2,0,2017-07-17,Left,89.0,,,,box_hill
3,3,0,2018-06-14,Left,89.0,,,,box_hill
4,4,0,2019-02-07,Left,94.0,,,,box_hill


## Clean up time values

In [7]:
def time_sort(df):
    id_list = df.id.unique()
    frames = []
    for eye in id_list:
        pdf = df[df.id == eye]
        pdf.CreatedDate = pd.to_datetime(pdf.CreatedDate, dayfirst=True)
        pdf.sort_values(by='CreatedDate', inplace=True)
        frames.append(pdf)
    return pd.concat(frames)

In [8]:
df1 = time_sort(df1)
df1.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,2017-02-22,Left,76.0,,,,box_hill
1,0,2017-03-09,Left,76.0,Avastin,,,box_hill
2,0,2017-03-22,Left,76.0,,,,box_hill
3,0,2017-04-11,Left,76.0,Lucentis,Lucentis,4.0,box_hill
4,0,2017-05-09,Left,76.0,Lucentis,Lucentis,4.0,box_hill


In [9]:
df2 = time_sort(df2)
df2.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,2020-11-09,Left,76.0,,,,boronia
1,0,2020-11-09,Left,,,,,boronia
2,0,2020-12-08,Left,76.0,,,,boronia
3,0,2021-01-05,Left,76.0,,,,boronia
4,0,2021-01-28,Left,76.0,,,,boronia


In [10]:
import datetime

def year_cutoff(df, year):
    id_list = df.id.unique()
    frames = []
    for eye in id_list:
        pdf = df[df.id == eye]
        days = year * 365
        toAdd = datetime.timedelta(days=days)
        start = pdf.CreatedDate.iloc[0]
        cutoff = start + toAdd
        if pdf.CreatedDate.iloc[-1] >= cutoff:
            pdf = pdf[pdf.CreatedDate <= cutoff]
            frames.append(pdf)
    return pd.concat(frames)

## Fill in missing values

In [11]:
df1.InjToday.fillna(method="ffill", inplace=True)
df1.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,2017-02-22,Left,76.0,,,,box_hill
1,0,2017-03-09,Left,76.0,Avastin,,,box_hill
2,0,2017-03-22,Left,76.0,Avastin,,,box_hill
3,0,2017-04-11,Left,76.0,Lucentis,Lucentis,4.0,box_hill
4,0,2017-05-09,Left,76.0,Lucentis,Lucentis,4.0,box_hill


In [12]:
df2.visual_acuity.fillna(method="ffill", inplace=True)
df2.InjToday.fillna(method="ffill", inplace=True)
df2.InjNext.fillna(method="ffill", inplace=True)
df2.NextInt.fillna(method="ffill", inplace=True)

In [13]:
df2.isna().sum()

id               0
CreatedDate      0
Laterality       0
visual_acuity    0
InjToday         7
InjNext          7
NextInt          7
location         0
dtype: int64

In [14]:
df1.visual_acuity.fillna(method="ffill", inplace=True)
df1.InjToday.fillna(method="ffill", inplace=True)
df1.InjNext.fillna(method="ffill", inplace=True)
df1.NextInt.fillna(method="ffill", inplace=True)

In [15]:
df1.isna().sum()

id               0
CreatedDate      0
Laterality       0
visual_acuity    0
InjToday         1
InjNext          3
NextInt          3
location         0
dtype: int64

In [16]:
df3.visual_acuity.fillna(method="ffill", inplace=True)
df3.InjToday.fillna(method="ffill", inplace=True)
df3.InjNext.fillna(method="ffill", inplace=True)
df3.NextInt.fillna(method="ffill", inplace=True)

In [17]:
df3.isna().sum()

Unnamed: 0.1.1     0
id                 0
CreatedDate        0
Laterality         0
visual_acuity      0
InjToday          11
InjNext           11
NextInt           11
location           0
dtype: int64

## Combine all dataframes together

In [18]:
df2.id = df2.id + 500
df3.id = df3.id + 1000

In [19]:
frames = [df1, df2, df3]
master = pd.concat(frames)
len(df1) + len(df2) + len(df3)

8904

In [20]:
len(master)

8904

In [22]:
master.drop(columns=['Unnamed: 0.1.1'], inplace=True)
master.head()

Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,2017-02-22 00:00:00,Left,76.0,,,,box_hill
1,0,2017-03-09 00:00:00,Left,76.0,Avastin,,,box_hill
2,0,2017-03-22 00:00:00,Left,76.0,Avastin,,,box_hill
3,0,2017-04-11 00:00:00,Left,76.0,Lucentis,Lucentis,4.0,box_hill
4,0,2017-05-09 00:00:00,Left,76.0,Lucentis,Lucentis,4.0,box_hill


In [25]:
master.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df.csv')

## Feature engineering

In [26]:
class VisualOutcomes:
    
    def vision_list(self, df):
        """
        Returns a list of visual acuity for patient.
        Input: patient Pandas dataframe.
        Output: list.
        """
        df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
        df.sort_values(by=['CreatedDate'], inplace=True)
        lst = df['visual_acuity'].dropna()
        return lst.to_list()
    
    def mean_vision(self, df):
        """
        Returns the mean vision of a patient.
        Input: patient Pandas dataframe.
        Output: float (LogMAR letters).
        """
        lst = self.vision_list(df)
        return np.mean(lst)

    def loss_from_peak(self, df):
        """
        Returns the VLP for a patient.
        Vision Loss from Peak (VLP) is defined as max vision minus last vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[-1]
    
    def overall_visual_change(self, df):
        """
        Returns the OVC for a patient.
        Overall Visual Change (OVC) is defined as last vision minus first vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        """
        visions = self.vision_list(df)
        last = (visions[-1] + visions[-2] + visions[-3]) / 3
        first = visions[0]
        return last - first
    
    def peak_visual_improvement(self, df):
        """
        Returns the PVI for a patient.
        Peak Visual Improvement (PVI) is defined as max vision minus initial vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[0]

    def proportion_above_baseline(self, df):
        """
        Returns the proportion of time above starting vision.
        Input: patient Pandas dataframe.
        Output: float (percentage).
        """
        lst = self.vision_list(df)
        starting_vision = lst[0]
        above_lst = [i for i in lst if i > starting_vision]
        if len(above_lst) != 0:
            mean = sum(above_lst) / len(above_lst)
        else:
            mean = 0
        return mean

    def patient_clean(self, df, number_years):
        """
        Shortens a patient's dataframe to x years after initiation.
        Input: patient Pandas dataframe, integer.
        Output: Pandas dataframe.
        """
        dates = df['CreatedDate'].to_list()
        first = dates[0]
        cutoff = first.replace(year = first.year + number_years)
        df = df[df['CreatedDate'] < cutoff]
        return df

    def time_above_baseline(self, df, number_years):
        """
        Returns the number of days a patient spent above baseline in first x years.
        Input: Pandas dataframe, integer.
        Output: integer (days).
        """
        df["CreatedDate"] = pd.to_datetime(df["CreatedDate"])
        df.sort_values(by=['CreatedDate'], inplace=True)
        dates = df["CreatedDate"].to_list()
        if (dates[-1] - dates[0]).days > (number_years * 365):
            df = patient_clean(df, number_years)
            vision = df['visual_acuity'].to_list()
            dates2 = df["CreatedDate"].to_list()
            days = 0
            starting_vision = vision[0]
            for i in range(1, len(vision)):
                if vision[i] > vision[0]:
                    between = (dates2[i] - dates2[i-1]).days
                    days += between
            return days
        else:
            return 'nil'
        
    def time_above_baseline2(self, df):
        """
        Returns the number of days a patient spent above baseline in first x years.
        Input: Pandas dataframe, integer.
        Output: integer (days).
        """
        df["CreatedDate"] = pd.to_datetime(df["CreatedDate"])
        df.sort_values(by=['CreatedDate'], inplace=True)
        dates = df["CreatedDate"].to_list()
        vision = df['visual_acuity'].to_list()
        dates2 = df["CreatedDate"].to_list()
        days = 0
        starting_vision = vision[0]
        for i in range(1, len(vision)):
            if vision[i] > vision[0]:
                between = (dates2[i] - dates2[i-1]).days
                days += between
        return days

    def time_to_peak(self, df):
        """
        Returns the TPVI for a patient.
        Time to Peak Visual Improvement (TPVI) is defined in days.
        Input: Pandas dataframe.
        Output: integer (days).
        """
        df["CreatedDate"] = pd.to_datetime(df["CreatedDate"])
        df.sort_values(by=['CreatedDate'], inplace=True)
        dates = df["CreatedDate"].to_list()
        max_value = df['visual_acuity'].max()
        df_fin = df[df['visual_acuity'] == max_value]
        initial_date = dates[0]
        final_date = df_fin.CreatedDate.iloc[0]
        return (final_date - initial_date).days
    
    def baseline_vision(self, df):
        """
        Returns the baseline vision for a patient.
        Input: Pandas dataframe.
        Output: integer (LogMAR letters).
        """
        lst = self.vision_list(df)
        return lst[0]
    
    def time_since_starting(self, df):
        """
        Returns the days since starting treatment, as of final visit.
        """
        dates = df["CreatedDate"].to_list()
        initial_date = dates[0]
        final_date = dates[-1]
        return (final_date - initial_date).days
    
    def location(self, df):
        """
        Returns the location of the clinic.
        """
        df.dropna(subset=['location'], inplace=True)
        return df.location.iloc[0]

In [29]:
class Dataframe(VisualOutcomes):
    
    def get_df(self):
        """
        Returns the dataframe to be analysed (all visits).
        """
        df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df.csv')
        df.drop(columns=['Unnamed: 0'], inplace=True)
        df.dropna(inplace=True)
        return df
    
    def dataframe_gen(self, pdf, pat_id):
        """
        Returns a dataframe of all adherence measures and visual outcomes.
        For a singular patient only (will be one row).
        Input: integer (patient id).
        Output: Pandas dataframe.
        """
        df = pdf[pdf["id"] == pat_id]
        data = {'mean_vision': [VisualOutcomes.mean_vision(self, df)], 
                'time_above_baseline': [VisualOutcomes.time_above_baseline2(self, df)],
                'peak_visual_improvement': [VisualOutcomes.peak_visual_improvement(self, df)],
                'overall_visual_change': [VisualOutcomes.overall_visual_change(self, df)],
                'time_to_peak': [VisualOutcomes.time_to_peak(self, df)],
                'baseline': [VisualOutcomes.baseline_vision(self, df)],
                'visits': [len(df)], 
                'time_since_starting': [VisualOutcomes.time_since_starting(self, df)],
                'location': [VisualOutcomes.location(self, df)]}
        return pd.DataFrame(data)

    
    def master_dataframe(self):
        """
        Returns a dataframe of statics for all patients.
        """
        df = self.get_df()
        id_list = df["id"].unique()
        frames = []
        for i in range(len(id_list)):
            try:
                pdf = self.dataframe_gen(df, id_list[i])
                pdf['id'] = i
                frames.append(pdf)
            except:
                i += 1
        master = pd.concat(frames)
        master.reset_index(inplace=True)
        master.drop(columns=['index'], inplace=True)
        return master

In [30]:
obj = Dataframe()
df = obj.master_dataframe()

In [31]:
df.head()

Unnamed: 0,mean_vision,time_above_baseline,peak_visual_improvement,overall_visual_change,time_to_peak,baseline,visits,time_since_starting,location,id
0,70.064516,0,0.0,-9.333333,0,76.0,31,1532,box_hill,0
1,71.0625,0,0.0,-31.0,0,85.0,16,1156,box_hill,1
2,72.5625,0,0.0,-21.0,0,85.0,16,1156,box_hill,2
3,77.448276,2036,20.0,15.0,987,65.0,29,2036,box_hill,3
4,73.689655,225,4.0,4.0,1829,76.0,29,2036,box_hill,4


In [32]:
len(df)

321

In [33]:
def label_ovc(row): return 1 if row['overall_visual_change'] > 0 else 0

In [34]:
df['outcome'] = df.apply(lambda row: label_ovc(row), axis=1)

In [35]:
df.head()

Unnamed: 0,mean_vision,time_above_baseline,peak_visual_improvement,overall_visual_change,time_to_peak,baseline,visits,time_since_starting,location,id,outcome
0,70.064516,0,0.0,-9.333333,0,76.0,31,1532,box_hill,0,0
1,71.0625,0,0.0,-31.0,0,85.0,16,1156,box_hill,1,0
2,72.5625,0,0.0,-21.0,0,85.0,16,1156,box_hill,2,0
3,77.448276,2036,20.0,15.0,987,65.0,29,2036,box_hill,3,1
4,73.689655,225,4.0,4.0,1829,76.0,29,2036,box_hill,4,1


In [36]:
df.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_features.csv')

## Create train and test sets

In [37]:
df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_features.csv')

We use `sample` with `frac=1` to shuffle the dataframe. We reset the indices since they change after shuffling the dataframe.

In [38]:
df = df.sample(frac=1).reset_index(drop=True)

In [39]:
len(df)

321

The first 250 rows are selected for training, with the remaining 71 for the test set.

In [40]:
df_train = df.head(250)
df_test = df.tail(71)

In [41]:
df_train.drop(columns=['overall_visual_change'], inplace=True)
df_test.drop(columns=['overall_visual_change'], inplace=True)

In [42]:
df_train.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_train.csv')
df_test.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_test.csv')

In [43]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,mean_vision,time_above_baseline,peak_visual_improvement,time_to_peak,baseline,visits,time_since_starting,location,id,outcome
0,231,76.56,959,10.0,232,70.0,25,959,boronia,239,1
1,191,77.027397,0,0.0,0,94.0,73,2812,boronia,199,0
2,75,74.0,0,0.0,0,76.0,6,593,box_hill,78,0
3,265,67.947368,793,9.0,756,76.0,38,1813,box_hill,273,1
4,68,66.818182,0,0.0,0,70.0,11,304,box_hill,71,0
