In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings('ignore')

In [78]:
# Load the raw visit records and preview the first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,CreatedDate,Laterality,visual_acuity,InjToday,InjNext,NextInt,location
0,0,0,2017-02-22 00:00:00,Left,76.0,,,,box_hill
1,1,0,2017-03-09 00:00:00,Left,76.0,Avastin,,,box_hill
2,2,0,2017-03-22 00:00:00,Left,76.0,Avastin,,,box_hill
3,3,0,2017-04-11 00:00:00,Left,76.0,Lucentis,Lucentis,4.0,box_hill
4,4,0,2017-05-09 00:00:00,Left,76.0,Lucentis,Lucentis,4.0,box_hill


To avoid leaking outcome information into the model, the features must be built from the first three visits only. We therefore keep only patients with at least two years of data, and we also require each patient to have had at least 8 visits within those two years.

In [2]:
import datetime

def year_cutoff(df, year, visits):
    """
    Restricts each id to its first `year` years of records.

    An id is kept only if its records extend to at least `year` years after
    its earliest CreatedDate and it has at least `visits` rows on or before
    that cutoff. Kept rows are returned in chronological order per id.

    Input: Pandas dataframe (needs id and CreatedDate columns), integer, integer.
    Output: Pandas dataframe with CreatedDate parsed to datetimes; an empty
    dataframe with the same columns when no id qualifies.
    """
    frames = []
    for eye in df.id.unique():
        # assign() builds a new frame, so the dates are parsed without writing
        # into a slice of `df` (the source of SettingWithCopyWarning).
        pdf = df[df.id == eye].assign(
            CreatedDate=lambda d: pd.to_datetime(d.CreatedDate)
        )
        # Do not rely on the input being pre-sorted: order the visits and take
        # the earliest/latest date explicitly. mergesort is stable, so rows
        # that are already in date order keep their original order.
        pdf = pdf.sort_values('CreatedDate', kind='mergesort')
        start = pdf.CreatedDate.min()
        cutoff = start + pd.offsets.DateOffset(years=year)
        if pdf.CreatedDate.max() >= cutoff:
            pdf = pdf[pdf.CreatedDate <= cutoff]
            if len(pdf) >= visits:
                frames.append(pdf)
    if not frames:
        # pd.concat([]) raises ValueError; return an empty frame instead.
        return df.iloc[0:0].assign(
            CreatedDate=lambda d: pd.to_datetime(d.CreatedDate)
        )
    return pd.concat(frames)

In [98]:
# `df_new` was not defined by any cell in this notebook, so this cell failed on
# a fresh kernel. Build it here from the raw visits.
# NOTE(review): year=2 / visits=8 follow the description above ("at least two
# years of data", "at least 8 visits"); the output file name df_one_year.csv
# suggests year=1 may have been used instead — confirm before trusting counts.
df_new = year_cutoff(df, 2, 8)
len(df), len(df_new)

(8904, 3168)

In [99]:
# Persist the filtered visits; Dataframe.get_df() reads this same file back.
# to_csv writes the index by default — presumably the 'Unnamed: 0' column that
# get_df() drops after reading.
df_new.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_one_year.csv')

We now want the features from the first three visits. Let's see if we can come up with some useful features.

In [3]:
class VisualOutcomes:
    """
    Visual-acuity summary statistics for a single patient.

    Every method takes one patient's visits as a Pandas dataframe and leaves
    that dataframe unmodified.
    """

    def _chronological(self, df):
        """
        Returns a copy of df with CreatedDate parsed and rows in date order.
        Input: patient Pandas dataframe.
        Output: Pandas dataframe.
        """
        # assign() and sort_values() both return new frames, so the caller's
        # frame (often a slice of a larger one) is never written to.
        parsed = df.assign(CreatedDate=pd.to_datetime(df['CreatedDate']))
        # mergesort is stable: same-day visits keep their original order.
        return parsed.sort_values(by=['CreatedDate'], kind='mergesort')

    def vision_list(self, df):
        """
        Returns a list of visual acuity for patient, in chronological order.
        Missing readings are dropped.
        Input: patient Pandas dataframe.
        Output: list.
        """
        ordered = self._chronological(df)
        return ordered['visual_acuity'].dropna().to_list()

    def mean_vision(self, df):
        """
        Returns the mean vision of a patient.
        Input: patient Pandas dataframe.
        Output: float (LogMAR letters); nan when there are no readings.
        """
        return np.mean(self.vision_list(df))

    def std_vision(self, df):
        """
        Returns the (population) standard deviation of vision.
        Input: patient Pandas dataframe.
        Output: float; nan when there are no readings.
        """
        return np.std(self.vision_list(df))

    def loss_from_peak(self, df):
        """
        Returns the VLP for a patient.
        Vision Loss from Peak (VLP) is defined as max vision minus last vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        Raises: ValueError when there are no readings.
        """
        visions = self.vision_list(df)
        return max(visions) - visions[-1]

    def overall_visual_change(self, df):
        """
        Returns the OVC for a patient.
        Overall Visual Change (OVC) is the mean of the last three readings
        minus the first reading. With fewer than three readings, the mean of
        the readings that exist is used.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        Raises: IndexError when there are no readings.
        """
        visions = self.vision_list(df)
        first = visions[0]
        # Slicing (instead of indexing -1, -2, -3) avoids an IndexError for
        # patients with one or two readings. The tail is summed newest-first,
        # the same order as the original expression.
        tail = visions[-3:]
        last = sum(tail[::-1]) / len(tail)
        return last - first

    def peak_visual_improvement(self, df):
        """
        Returns the PVI for a patient.
        Peak Visual Improvement (PVI) is defined as max vision minus initial vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        Raises: ValueError when there are no readings.
        """
        visions = self.vision_list(df)
        return max(visions) - visions[0]

    def proportion_above_baseline(self, df):
        """
        Returns the share of readings that are above the starting vision.
        Input: patient Pandas dataframe.
        Output: float (percentage, 0-100).
        Raises: IndexError when there are no readings.
        """
        lst = self.vision_list(df)
        starting_vision = lst[0]
        above_count = sum(1 for i in lst if i > starting_vision)
        # Divide by the number of readings: the previous version averaged the
        # readings themselves, which is a vision value, not a proportion.
        return 100 * above_count / len(lst)

    def patient_clean(self, df, number_years):
        """
        Shortens a patient's dataframe to x years after initiation.
        Initiation is the CreatedDate of the first row; rows dated on or after
        the cutoff are removed.
        Input: patient Pandas dataframe, integer.
        Output: Pandas dataframe.
        Raises: IndexError when the dataframe is empty.
        """
        # Parse here so string dates work too, and use DateOffset rather than
        # datetime.replace(year=...), which raises for a 29 February start.
        dates = pd.to_datetime(df['CreatedDate'])
        cutoff = dates.iloc[0] + pd.DateOffset(years=number_years)
        return df[(dates < cutoff).values]

    def baseline_vision(self, df):
        """
        Returns the baseline vision for a patient.
        Input: Pandas dataframe.
        Output: integer (LogMAR letters).
        Raises: IndexError when there are no readings.
        """
        return self.vision_list(df)[0]

    def init_drug(self, df):
        """
        Returns the first non-missing InjToday value, in row order.
        Input: patient Pandas dataframe.
        Output: value of the InjToday column.
        Raises: IndexError when every InjToday value is missing.
        """
        # Drop missing values on the column only; the previous in-place
        # dropna removed whole rows from the caller's dataframe.
        return df['InjToday'].dropna().iloc[0]

    def location(self, df):
        """
        Returns the location of the clinic (first non-missing value, in row order).
        Input: patient Pandas dataframe.
        Output: value of the location column.
        Raises: IndexError when every location value is missing.
        """
        return df['location'].dropna().iloc[0]

In [8]:
class Dataframe(VisualOutcomes):
    """
    Builds the one-row-per-id modelling table from the visit-level CSV.
    """

    # File read by get_df() when no path is supplied.
    DEFAULT_PATH = '/home/jupyter/charliemacuject/research_papers/dme/input/df_one_year.csv'

    # Columns produced by dataframe_gen(), in output order.
    FEATURE_COLUMNS = ['mean_vision', 'std_vision', 'peak_visual_improvement',
                       'overall_visual_change', 'baseline', 'location',
                       'initiation_drug']

    def get_df(self, path=None):
        """
        Returns the dataframe to be analysed (all visits).
        Input: optional CSV path (defaults to DEFAULT_PATH).
        Output: Pandas dataframe with incomplete rows removed.
        """
        df = pd.read_csv(self.DEFAULT_PATH if path is None else path)
        # errors='ignore': the index column is only present when the CSV was
        # written with its index, so its absence should not be an error.
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        # NOTE(review): dropna() without a subset removes every visit that is
        # missing ANY column (e.g. InjNext / NextInt on early visits), so the
        # "first N visits" used downstream are the first N complete rows and
        # initiation_drug can come from a later visit than the true first
        # injection. Kept as-is because it changes the cohort — confirm intent.
        return df.dropna()

    def dataframe_gen(self, df, input_visits):
        """
        Returns a dataframe of visual outcomes and baseline descriptors.
        For a singular patient only (will be one row).
        Features are computed from the first `input_visits` visits that have a
        visual acuity reading; overall_visual_change uses the full history.
        Input: patient Pandas dataframe, integer (number of leading visits).
        Output: Pandas dataframe.
        """
        # Work on a private, date-ordered copy: head() below must mean "the
        # earliest visits", and the caller's frame must not be modified.
        visits = (
            df.assign(CreatedDate=pd.to_datetime(df['CreatedDate']))
              .sort_values(by=['CreatedDate'], kind='mergesort')
              .dropna(subset=['visual_acuity'])
        )
        # NOTE(review): the outcome averages the last three readings of the
        # whole history, so for patients with fewer than input_visits + 3
        # readings the outcome window overlaps the feature window.
        ovc = self.overall_visual_change(visits)
        early = visits.head(input_visits).copy()
        data = {'mean_vision': [self.mean_vision(early)],
                'std_vision': [self.std_vision(early)],
                'peak_visual_improvement': [self.peak_visual_improvement(early)],
                'overall_visual_change': [ovc],
                'baseline': [self.baseline_vision(early)],
                'location': [self.location(early)],
                'initiation_drug': [self.init_drug(early)]}
        return pd.DataFrame(data)

    def master_dataframe(self, input_visits, path=None):
        """
        Returns a dataframe of statistics for all patients.
        Only ids with more than `input_visits` rows are included.
        Input: integer (number of leading visits), optional CSV path.
        Output: Pandas dataframe (one row per id); empty when no id qualifies.
        """
        df = self.get_df(path)
        frames = []
        # sort=False keeps ids in order of first appearance, like unique().
        for i, pdf in df.groupby('id', sort=False):
            if len(pdf) > input_visits:
                ndf = self.dataframe_gen(pdf, input_visits)
                ndf['id'] = i
                frames.append(ndf)
        if not frames:
            # pd.concat([]) raises ValueError; return an empty table instead.
            return pd.DataFrame(columns=self.FEATURE_COLUMNS + ['id'])
        return pd.concat(frames, ignore_index=True)

In [102]:
obj = Dataframe()
# master_dataframe() requires the number of leading visits to build features
# from; calling it with no argument raises TypeError. 3 matches the text above
# ("features from the first three visits").
df_final = obj.master_dataframe(3)

In [103]:
# Preview the per-id feature table (one row per id).
df_final.head(10)

Unnamed: 0,mean_vision,std_vision,peak_visual_improvement,overall_visual_change,baseline,location,initiation_drug,id
0,76.0,0.0,0.0,-6.0,76.0,box_hill,Lucentis,0
1,82.0,4.242641,0.0,-11.0,85.0,box_hill,Eylea,2
2,83.333333,2.357023,0.0,-12.666667,85.0,box_hill,Eylea,3
3,94.0,0.0,0.0,0.0,94.0,box_hill,Eylea,6
4,94.0,0.0,0.0,0.0,94.0,box_hill,Eylea,7
5,61.0,0.0,0.0,-0.666667,61.0,box_hill,Eylea,8
6,77.333333,1.885618,4.0,4.333333,76.0,box_hill,Ozurdex,9
7,85.0,0.0,0.0,-1.666667,85.0,box_hill,Avastin,13
8,68.333333,2.357023,5.0,13.666667,65.0,box_hill,Lucentis,14
9,73.333333,4.714045,10.0,2.333333,70.0,box_hill,Lucentis,16


In [104]:
def label_ovc(row):
    """Binary outcome for one row: 1 when overall_visual_change is positive, otherwise 0."""
    if row['overall_visual_change'] > 0:
        return 1
    return 0

df_final['outcome'] = df_final.apply(label_ovc, axis=1)

In [105]:
# Shuffle the rows so the positional train/test split further down is random.
# A fixed random_state makes the shuffle (and therefore the split) identical
# on every Restart & Run All.
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
df_final.head()

Unnamed: 0,mean_vision,std_vision,peak_visual_improvement,overall_visual_change,baseline,location,initiation_drug,id,outcome
0,80.0,0.0,0.0,5.0,80.0,boronia,Eylea,620,1
1,78.666667,1.885618,0.0,-13.0,80.0,boronia,Eylea,583,0
2,83.333333,2.357023,0.0,-1.666667,85.0,boronia,Lucentis,657,0
3,78.666667,1.885618,0.0,-11.333333,80.0,box_hill,Lucentis,45,0
4,74.0,2.828427,0.0,-19.0,76.0,boronia,Eylea,668,0


In [106]:
# Number of rows available for the train/test split.
len(df_final)

183

In [107]:
# Split on a single index so every row lands in exactly one set. The previous
# fixed sizes (head(140) / tail(29)) silently discarded the rows in between
# whenever len(df_final) != 169. The 72/28 ratio matches generate_data().
n_train = round(len(df_final) * 0.72)
# .copy() so the later column drop works on independent frames, not slices.
df_train = df_final.iloc[:n_train].copy()
df_test = df_final.iloc[n_train:].copy()

In [108]:
# 'outcome' was derived from overall_visual_change, so that column must not
# reach the model. Re-assigning (instead of inplace=True on a head()/tail()
# slice) avoids SettingWithCopyWarning; errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df_train = df_train.drop(columns=['overall_visual_change'], errors='ignore')
df_test = df_test.drop(columns=['overall_visual_change'], errors='ignore')

df_train.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_train_one.csv')
df_test.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_test_one.csv')

## Put it all in one function

In [9]:
def year_cutoff(df, year, visits):
    """
    Restricts each id to its first `year` years of records.

    An id is kept only if its records extend to at least `year` years after
    its earliest CreatedDate and it has at least `visits` rows on or before
    that cutoff. Kept rows are returned in chronological order per id.

    Input: Pandas dataframe (needs id and CreatedDate columns), integer, integer.
    Output: Pandas dataframe with CreatedDate parsed to datetimes; an empty
    dataframe with the same columns when no id qualifies.
    """
    frames = []
    for eye in df.id.unique():
        # assign() builds a new frame, so the dates are parsed without writing
        # into a slice of `df` (the source of SettingWithCopyWarning).
        pdf = df[df.id == eye].assign(
            CreatedDate=lambda d: pd.to_datetime(d.CreatedDate)
        )
        # Do not rely on the input being pre-sorted: order the visits and take
        # the earliest/latest date explicitly. mergesort is stable, so rows
        # that are already in date order keep their original order.
        pdf = pdf.sort_values('CreatedDate', kind='mergesort')
        start = pdf.CreatedDate.min()
        cutoff = start + pd.offsets.DateOffset(years=year)
        if pdf.CreatedDate.max() >= cutoff:
            pdf = pdf[pdf.CreatedDate <= cutoff]
            if len(pdf) >= visits:
                frames.append(pdf)
    if not frames:
        # pd.concat([]) raises ValueError; return an empty frame instead.
        return df.iloc[0:0].assign(
            CreatedDate=lambda d: pd.to_datetime(d.CreatedDate)
        )
    return pd.concat(frames)

In [17]:
def generate_data(input_visits, visits, year, seed=None):
    """
    Rebuilds the train and test CSVs from the raw visit file.

    Steps: filter the raw visits with year_cutoff(), save them for
    Dataframe.get_df(), build the per-id feature table, label it, shuffle it
    and write a 72/28 train/test split.

    Input:
        input_visits: integer, number of leading visits used for features.
        visits: integer, minimum number of visits inside the window.
        year: integer, length of the window in years.
        seed: optional integer; pass a value for a reproducible shuffle
            (default None keeps the previous, unseeded behaviour).
    Output: None (writes df_one_year.csv, df_train_one.csv, df_test_one.csv).
    """
    df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df.csv')
    df_new = year_cutoff(df, year, visits)
    # Dataframe.get_df() reads this file back, so it must be written first.
    df_new.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_one_year.csv')
    obj = Dataframe()
    df_final = obj.master_dataframe(input_visits)
    # Vectorised form of "1 if overall_visual_change > 0 else 0"; a missing
    # value compares False and is labelled 0, as before.
    df_final['outcome'] = (df_final['overall_visual_change'] > 0).astype(int)
    df_final = df_final.sample(frac=1, random_state=seed).reset_index(drop=True)
    # One split index guarantees the two sets are disjoint and cover every row;
    # rounding the two sizes independently does not.
    n_train = round(len(df_final) * 0.72)
    # The outcome is derived from overall_visual_change, so drop it from both
    # sets. drop() returns new frames, avoiding in-place edits on slices.
    df_train = df_final.iloc[:n_train].drop(columns=['overall_visual_change'])
    df_test = df_final.iloc[n_train:].drop(columns=['overall_visual_change'])
    df_train.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_train_one.csv')
    df_test.to_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_test_one.csv')


In [26]:
# Features from the first 4 visits; keep ids with at least 6 visits inside a
# 3-year window (arguments are input_visits, visits, year).
generate_data(4, 6, 3)