In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import kurtosis
from scipy.stats import skew
import seaborn as sns
from scipy.integrate import quad

import warnings
warnings.filterwarnings('ignore')

In [179]:
# Load the two-year visit table; the 'Unnamed: 0' column is dropped straight
# after reading so that `df` starts with the real data columns.
df = pd.read_csv(
    '/home/jupyter/charliemacuject/research_papers/data/master_twoyears.csv',
    low_memory=False,
).drop(columns=['Unnamed: 0'])

In [180]:
# Number of visit rows loaded from master_twoyears.csv.
len(df)

2884

In [181]:
# Preview the first rows to check the columns after dropping 'Unnamed: 0'.
df.head()

Unnamed: 0,Unnamed: 0.1,gender,age,CreatedDate,ExamGraph,NextTime,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,prev_vision,mean_vision,std_vision,ID
0,757,Female,87,2017-02-07,0.1,4.0,85,4.86,0.97,0.9,0.125,0.1,0.1,0.0,3
1,760,Female,87,2017-04-04,0.333333,4.0,141,4.0,1.0,0.94,0.0976,0.25,0.14,0.065,3
2,761,Female,87,2017-05-09,0.333333,6.0,176,5.0,1.25,0.96,0.09,0.333333,0.18,0.0975,3
3,762,Female,87,2017-06-20,0.333333,8.0,218,6.0,1.0,1.01,0.137,0.333333,0.2,0.1065,3
4,763,Female,87,2017-08-08,0.333333,7.0,267,7.0,0.88,1.0,0.1268,0.333333,0.22,0.1086,3


In [182]:
# Bucket visits by their `actual_time` value. The first four buckets keep only
# exact matches (4, 6, 8, 10); the last bucket keeps everything from 12 upwards.
# Rows with any other value (e.g. 4.86, 5.0, 7.0) land in no bucket.
interval = df['actual_time']
df4, df6, df8, df10 = (
    df[interval.eq(exact)] for exact in (4.0, 6.0, 8.0, 10.0)
)
df12 = df[interval.ge(12.0)]

In [183]:
# Persist each interval bucket to its own CSV (index included, as before).
for out_path, bucket in (
    ('/home/jupyter/charliemacuject/research_papers/data/df4.csv', df4),
    ('/home/jupyter/charliemacuject/research_papers/data/df6.csv', df6),
    ('/home/jupyter/charliemacuject/research_papers/data/df8.csv', df8),
    ('/home/jupyter/charliemacuject/research_papers/data/df10.csv', df10),
    ('/home/jupyter/charliemacuject/research_papers/data/df12.csv', df12),
):
    bucket.to_csv(out_path)

## Data cleaning class

In [155]:
class DataClean:
    """
    Loads the master visit table and trims every eye's history to two years.

    Methods
    -------
    get_df(self, path=None)
        Reads the visit CSV and parses CreatedDate into datetimes.
    keep_two(self)
        Keeps eyes with at least 730 days of follow-up, cut off two years
        after their first visit.
    """

    # Default CSV read by get_df when no explicit path is supplied.
    MASTER_CSV = '/home/jupyter/charliemacuject/research_papers/data/master_allstats.csv'

    def get_df(self, path=None):
        """
        Returns the visit table with CreatedDate parsed to datetimes.
        Input: optional CSV path (defaults to MASTER_CSV).
        Output: Pandas dataframe.
        """
        df = pd.read_csv(self.MASTER_CSV if path is None else path)
        df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
        return df

    def keep_two(self):
        """
        Returns the visits of every eye (ID) followed for at least 730 days,
        restricted to visits on or before the second anniversary of the
        eye's first visit. Rows come back grouped by ID in order of first
        appearance and sorted by CreatedDate within each ID.
        Input: none (data comes from get_df).
        Output: Pandas dataframe (empty, same columns, if no eye qualifies).
        """
        df = self.get_df()
        frames = []
        # sort=False keeps IDs in first-appearance order, like df['ID'].unique().
        for _, visits in df.groupby('ID', sort=False):
            # sort_values returns a new frame, so the group slice is never
            # modified in place.
            visits = visits.sort_values(by=['CreatedDate'])
            first = visits['CreatedDate'].iloc[0]
            last = visits['CreatedDate'].iloc[-1]
            if (last - first).days >= 730:
                # DateOffset clamps 29 Feb to 28 Feb in a non-leap target year;
                # Timestamp.replace(year=...) raises ValueError for that date.
                cutoff = first + pd.DateOffset(years=2)
                frames.append(visits[visits['CreatedDate'] <= cutoff])
        if not frames:
            # pd.concat([]) raises; an empty frame with the right columns is
            # easier for callers to handle.
            return df.iloc[0:0]
        return pd.concat(frames)

In [156]:
# NOTE: from here on `df` is a DataClean helper instance, not a dataframe.
df = DataClean()

In [157]:
# Build the two-year table: eyes with >= 730 days of follow-up, trimmed at the
# second anniversary of their first visit.
dataframe = df.keep_two()

In [158]:
# Sanity check: largest ExamGraph value in the trimmed table.
max(dataframe['ExamGraph'])

1.0

In [159]:
# Persist the two-year table. The index is written as well, which is why
# readers of this file drop an 'Unnamed: 0' column after loading.
dataframe.to_csv('/home/jupyter/charliemacuject/research_papers/data/master_twoyears.csv')

In [160]:
# Number of visit rows kept in the two-year table.
len(dataframe)

2884

## Adherence measures class

In [3]:
class AdherenceMeasures:
    """
    A class used to return the key adherence metrics for a patient.

    Every method takes one patient's visits as a Pandas dataframe and leaves
    that dataframe unmodified.

    Attributes
    ----------
    None

    Methods
    -------
    adherence_list(self, df)
        Returns a list of all adherence factors for patient, oldest visit first.
    mean_adherence(self, df)
        Returns the mean adherence factor for a patient.
    adherence_variance(self, df)
        Returns the standard deviation of adherence for a patient.
    percentage_late(self, df)
        Returns the fraction of visits a patient was late.
    percentage_early(self, df)
        Returns the fraction of visits a patient was early.
    MAFL(self, df)
        Returns the mean adherence when adherence > 1.
    MAFE(self, df)
        Returns the mean adherence when adherence < 1.
    patient_clean(self, df, number_years)
        Returns the visits before the given anniversary of the first visit.
    adherence_year(self, df, number_years)
        Returns the mean adherence over the first number_years years.
    gender(self, df)
        Returns the first gender value recorded for the patient.
    age(self, df)
        Returns the first age value recorded for the patient.
    num_visits(self, df)
        Returns the number of rows (visits) for the patient.
    """

    @staticmethod
    def _visits_by_date(df):
        """
        Returns a copy of df with CreatedDate parsed to datetimes and the rows
        ordered by that date. Working on a copy keeps the caller's dataframe
        (often a filtered slice) untouched.
        """
        parsed = df.assign(CreatedDate=pd.to_datetime(df['CreatedDate']))
        return parsed.sort_values(by=['CreatedDate'])

    @staticmethod
    def _mean_or_one(values):
        """Returns the mean of values, or 1 when there are none."""
        return sum(values) / len(values) if len(values) > 0 else 1

    def adherence_list(self, df):
        """
        Returns a list of adherence factors for patient, ordered by visit date.
        Missing adherence factors are dropped.
        Input: patient Pandas dataframe.
        Output: list.
        """
        ordered = AdherenceMeasures._visits_by_date(df)
        return ordered['adherence_factor'].dropna().to_list()

    def mean_adherence(self, df):
        """
        Returns the mean adherence factor for a patient.
        Input: patient Pandas dataframe.
        Output: float.
        Raises: ZeroDivisionError if the patient has no adherence factors.
        """
        factors = self.adherence_list(df)
        return sum(factors) / len(factors)

    def adherence_variance(self, df):
        """
        Returns the standard deviation of adherence for a patient
        (np.std with its default settings).
        Input: patient Pandas dataframe.
        Output: float.
        """
        return np.std(self.adherence_list(df))

    def percentage_late(self, df):
        """
        Returns the share of visits a patient was late (adherence > 1),
        expressed as a fraction between 0 and 1.
        Input: patient Pandas dataframe.
        Output: float.
        Raises: ZeroDivisionError if the patient has no adherence factors.
        """
        factors = self.adherence_list(df)
        late_count = len([f for f in factors if f > 1])
        return late_count / len(factors)

    def percentage_early(self, df):
        """
        Returns the share of visits a patient was early (adherence < 1),
        expressed as a fraction between 0 and 1.
        Input: patient Pandas dataframe.
        Output: float.
        Raises: ZeroDivisionError if the patient has no adherence factors.
        """
        factors = self.adherence_list(df)
        early_count = len([f for f in factors if f < 1])
        return early_count / len(factors)

    def MAFL(self, df):
        """
        Returns the mean adherence when adherence > 1.
        Falls back to 1 when the patient was never late.
        Input: patient Pandas dataframe.
        Output: float.
        """
        late = [f for f in self.adherence_list(df) if f > 1]
        return AdherenceMeasures._mean_or_one(late)

    def MAFE(self, df):
        """
        Returns the mean adherence when adherence < 1.
        Falls back to 1 when the patient was never early.
        Input: patient Pandas dataframe.
        Output: float.
        """
        early = [f for f in self.adherence_list(df) if f < 1]
        return AdherenceMeasures._mean_or_one(early)

    def patient_clean(self, df, number_years):
        """
        Shortens a patient's dataframe to x years after initiation.
        Keeps visits strictly before the number_years anniversary of the
        earliest visit. The result has CreatedDate as datetimes and is
        ordered by date, so it no longer matters whether the caller
        converted or sorted the dataframe beforehand.
        Input: patient Pandas dataframe, integer.
        Output: Pandas dataframe.
        """
        ordered = AdherenceMeasures._visits_by_date(df)
        first = ordered['CreatedDate'].min()
        # DateOffset clamps 29 Feb to 28 Feb in a non-leap target year;
        # Timestamp.replace(year=...) raises ValueError for that date.
        cutoff = first + pd.DateOffset(years=number_years)
        return ordered[ordered['CreatedDate'] < cutoff]

    def adherence_year(self, df, number_years):
        """
        Returns the mean adherence factor over the first number_years years.
        Input: patient Pandas dataframe, integer.
        Output: float.
        """
        window = self.patient_clean(df, number_years)
        return np.mean(self.adherence_list(window))

    def gender(self, df):
        """
        Returns the first distinct gender value in the patient's rows.
        Input: patient Pandas dataframe.
        Output: value of the gender column.
        """
        return df['gender'].unique()[0]

    def age(self, df):
        """
        Returns the first distinct age value in the patient's rows.
        Input: patient Pandas dataframe.
        Output: value of the age column.
        """
        return df['age'].unique()[0]

    def num_visits(self, df):
        """
        Returns the number of rows in the patient's dataframe.
        Input: patient Pandas dataframe.
        Output: integer.
        """
        return len(df)
    

## Visual outcome class

In [4]:
class VisualOutcomes:
    """
    A class used to return the key visual outcome metrics for a patient.

    Vision values are read from the ExamGraph column and visit dates from
    CreatedDate. Every method takes one patient's visits as a Pandas
    dataframe and leaves that dataframe unmodified.
    """

    @staticmethod
    def _visits_by_date(df):
        """
        Returns a copy of df with CreatedDate parsed to datetimes and the rows
        ordered by that date. Working on a copy keeps the caller's dataframe
        (often a filtered slice) untouched.
        """
        parsed = df.assign(CreatedDate=pd.to_datetime(df['CreatedDate']))
        return parsed.sort_values(by=['CreatedDate'])

    @staticmethod
    def _days_above_baseline(ordered):
        """
        Sums the days between consecutive visits for every visit (after the
        first) whose vision is above the first visit's vision.
        Expects a date-ordered dataframe with datetime CreatedDate.
        """
        vision = ordered['ExamGraph'].to_list()
        dates = ordered['CreatedDate'].to_list()
        baseline = vision[0]
        return sum(
            (dates[i] - dates[i - 1]).days
            for i in range(1, len(vision))
            if vision[i] > baseline
        )

    def vision_list(self, df):
        """
        Returns a list of visual acuity for patient, ordered by visit date.
        Missing readings are dropped.
        Input: patient Pandas dataframe.
        Output: list.
        """
        ordered = VisualOutcomes._visits_by_date(df)
        return ordered['ExamGraph'].dropna().to_list()

    def mean_vision(self, df):
        """
        Returns the mean vision of a patient.
        Input: patient Pandas dataframe.
        Output: float (same units as ExamGraph).
        """
        return np.mean(self.vision_list(df))

    def loss_from_peak(self, df):
        """
        Returns the VLP for a patient.
        Vision Loss from Peak (VLP) is defined as max vision minus last vision.
        Input: Pandas dataframe.
        Output: float (same units as ExamGraph).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[-1]

    def overall_visual_change(self, df):
        """
        Returns the OVC for a patient.
        Overall Visual Change (OVC) is the mean of the last three readings
        minus the first reading. Patients with fewer than three readings use
        however many they have instead of raising IndexError.
        Input: Pandas dataframe.
        Output: float (same units as ExamGraph).
        """
        visions = self.vision_list(df)
        first = visions[0]
        # Newest-first so the additions happen in the same order as the
        # original (v[-1] + v[-2] + v[-3]) and give bit-identical floats.
        recent = visions[-3:][::-1]
        return sum(recent) / len(recent) - first

    def peak_visual_improvement(self, df):
        """
        Returns the PVI for a patient.
        Peak Visual Improvement (PVI) is defined as max vision minus initial vision.
        Input: Pandas dataframe.
        Output: float (same units as ExamGraph).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[0]

    def proportion_above_baseline(self, df):
        """
        Returns the proportion of readings above starting vision.
        Previously this returned the mean of the above-baseline readings,
        which is a vision value rather than a proportion.
        Input: patient Pandas dataframe.
        Output: float (fraction between 0 and 1).
        """
        visions = self.vision_list(df)
        starting_vision = visions[0]
        above = [v for v in visions if v > starting_vision]
        return len(above) / len(visions)

    def patient_clean(self, df, number_years):
        """
        Shortens a patient's dataframe to x years after initiation.
        Keeps visits strictly before the number_years anniversary of the
        earliest visit. The result has CreatedDate as datetimes and is
        ordered by date.
        Input: patient Pandas dataframe, integer.
        Output: Pandas dataframe.
        """
        ordered = VisualOutcomes._visits_by_date(df)
        first = ordered['CreatedDate'].min()
        # DateOffset clamps 29 Feb to 28 Feb in a non-leap target year;
        # Timestamp.replace(year=...) raises ValueError for that date.
        cutoff = first + pd.DateOffset(years=number_years)
        return ordered[ordered['CreatedDate'] < cutoff]

    def time_above_baseline(self, df, number_years):
        """
        Returns the number of days a patient spent above baseline in first x years.
        Input: Pandas dataframe, integer.
        Output: integer (days), or the string 'nil' when follow-up does not
        exceed number_years * 365 days.
        """
        ordered = VisualOutcomes._visits_by_date(df)
        dates = ordered['CreatedDate']
        if (dates.iloc[-1] - dates.iloc[0]).days > (number_years * 365):
            # Must go through self: a bare patient_clean(...) is a NameError.
            window = self.patient_clean(ordered, number_years)
            return VisualOutcomes._days_above_baseline(window)
        return 'nil'

    def time_above_baseline2(self, df):
        """
        Returns the number of days a patient spent above baseline over all
        of the visits supplied (no follow-up requirement, no cut-off).
        Input: Pandas dataframe.
        Output: integer (days).
        """
        return VisualOutcomes._days_above_baseline(VisualOutcomes._visits_by_date(df))

    def time_to_peak(self, df):
        """
        Returns the TPVI for a patient.
        Time to Peak Visual Improvement (TPVI) is defined in days, counted
        from the first visit to the earliest visit with the maximum vision.
        Input: Pandas dataframe.
        Output: integer (days).
        """
        ordered = VisualOutcomes._visits_by_date(df)
        peak_rows = ordered[ordered['ExamGraph'] == ordered['ExamGraph'].max()]
        first_date = ordered['CreatedDate'].iloc[0]
        return (peak_rows['CreatedDate'].iloc[0] - first_date).days

    def baseline_vision(self, df):
        """
        Returns the baseline vision for a patient (earliest non-missing reading).
        Input: Pandas dataframe.
        Output: float (same units as ExamGraph).
        """
        return self.vision_list(df)[0]

## Dataframe generation class

In [5]:
class AdherenceDataframe(AdherenceMeasures, VisualOutcomes):
    """
    Builds a one-row-per-patient table of adherence measures and visual
    outcomes from one of the prepared visit CSVs.
    """

    # Dataset key -> (CSV path, extra read_csv keyword arguments).
    _SOURCES = {
        'all': ('/home/jupyter/charliemacuject/research_papers/data/master_allstats.csv',
                {'low_memory': False}),
        '4weeks': ('/home/jupyter/charliemacuject/research_papers/data/df4.csv', {}),
        '6weeks': ('/home/jupyter/charliemacuject/research_papers/data/df6.csv', {}),
        '8weeks': ('/home/jupyter/charliemacuject/research_papers/data/df8.csv', {}),
        '10weeks': ('/home/jupyter/charliemacuject/research_papers/data/df10.csv', {}),
        '12weeks': ('/home/jupyter/charliemacuject/research_papers/data/df12.csv', {}),
    }
    # Any key not listed above (e.g. 'two_years') reads the two-year table.
    _DEFAULT_SOURCE = ('/home/jupyter/charliemacuject/research_papers/data/master_twoyears.csv',
                       {'low_memory': False})

    def __init__(self, dataframe):
        """
        dataframe (all, two_years, 4weeks, 6weeks,...)
        Unrecognised keys select the two-year table.
        """
        self.dataframe = dataframe
        # Filled by master_dataframe with the IDs whose metrics failed.
        self.skipped_ids = []

    def get_df(self):
        """
        Returns the visit dataframe selected by self.dataframe, with the
        'Unnamed: 0' column dropped.
        """
        path, read_kwargs = self._SOURCES.get(self.dataframe, self._DEFAULT_SOURCE)
        return pd.read_csv(path, **read_kwargs).drop(columns=['Unnamed: 0'])

    def dataframe_gen(self, pdf, pat_id):
        """
        Returns a dataframe of all adherence measures and visual outcomes.
        For a singular patient only (will be one row).
        Input: Pandas dataframe of visits (all patients), patient ID.
        Output: Pandas dataframe.
        """
        # Explicit copy: the metric helpers may convert/sort their argument,
        # and that must never write through to the shared visit table.
        df = pdf[pdf["ID"] == pat_id].copy()
        data = {'mean_adherence': [self.mean_adherence(df)],
                'adherence_variation': [self.adherence_variance(df)],
                'percentage_late': [self.percentage_late(df)],
                'percentage_early': [self.percentage_early(df)],
                'MAFL': [self.MAFL(df)], 'MAFE': [self.MAFE(df)],
                'adherence_year1': [self.adherence_year(df, 1)],
                'adherence_year2': [self.adherence_year(df, 2)],
                'mean_vision': [self.mean_vision(df)],
                'time_above_baseline': [self.time_above_baseline2(df)],
                'peak_visual_improvement': [self.peak_visual_improvement(df)],
                'overall_visual_change': [self.overall_visual_change(df)],
                'time_to_peak': [self.time_to_peak(df)],
                'baseline': [self.baseline_vision(df)],
                'gender': [self.gender(df)], 'age': [self.age(df)],
                'visits': [self.num_visits(df)]}
        return pd.DataFrame(data)

    def column_edit(self, df):
        """
        Applies the threshold-based adjustments to the per-patient table,
        scales the three vision columns by 100 and rounds to 4 decimals.
        Works on a copy; the input dataframe is not modified.

        Two defects of the row loop are fixed here: it stopped at
        len(df) - 1 and so skipped the last patient, and it wrote through
        df[col].iloc[i] (chained assignment), which has no effect when
        pandas copy-on-write is active.

        NOTE(review): the three adjustments add fixed offsets (+0.07, +30,
        +0.07) to outcome columns depending on adherence thresholds. Nothing
        in this notebook explains where these numbers come from - confirm
        they are methodologically intended before relying on the output.

        Input: Pandas dataframe (one row per patient).
        Output: Pandas dataframe.
        """
        df = df.copy()

        # Steps run in the original order; step 3 deliberately sees the
        # time_above_baseline values already adjusted by step 2.
        step1 = (df['percentage_late'] < 0.15) & (df['time_above_baseline'] > 0)
        df.loc[step1, 'overall_visual_change'] += 0.07

        step2 = ((df['percentage_late'] < 0.12)
                 & (df['peak_visual_improvement'] > 0)
                 & (df['time_above_baseline'] < 700))
        df.loc[step2, 'time_above_baseline'] += 30

        step3 = ((df['MAFE'] < 0.94)
                 & (df['time_above_baseline'] > 0)
                 & (df['peak_visual_improvement'] < 0.90))
        df.loc[step3, 'peak_visual_improvement'] += 0.07

        for vision_col in ('mean_vision', 'peak_visual_improvement', 'overall_visual_change'):
            df[vision_col] = 100 * df[vision_col]
        return df.round(4)

    def master_dataframe(self):
        """
        Returns a dataframe of statistics for all patients.

        Patients whose metrics cannot be computed are left out; their IDs
        are recorded in self.skipped_ids so the omissions are visible.
        The output ID column holds the patient's position in the list of
        unique IDs (as before), not the ID from the source file.

        Raises: ValueError if no patient could be processed.
        """
        df = self.get_df()
        self.skipped_ids = []
        frames = []
        for position, pat_id in enumerate(df["ID"].unique()):
            try:
                row = self.dataframe_gen(df, pat_id)
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must still
                # propagate out of a long run.
                self.skipped_ids.append(pat_id)
                continue
            row['ID'] = position
            frames.append(row)
        if not frames:
            raise ValueError(
                'No patient could be processed; see skipped_ids for the IDs that failed.')
        master = pd.concat(frames, ignore_index=True)
        return self.column_edit(master)

In [6]:
# 'two_year' matches none of the explicit keys in AdherenceDataframe.get_df,
# so the fallback branch loads master_twoyears.csv.
dataframe = AdherenceDataframe(dataframe='two_year')
# NOTE: `df` is rebound here to the per-patient summary table.
df = dataframe.master_dataframe()

In [7]:
# Preview the per-patient summary (vision columns are already scaled by 100).
df.head()

Unnamed: 0,mean_adherence,adherence_variation,percentage_late,percentage_early,MAFL,MAFE,adherence_year1,adherence_year2,mean_vision,time_above_baseline,peak_visual_improvement,overall_visual_change,time_to_peak,baseline,gender,age,visits,ID
0,1.0335,0.149,0.1765,0.1765,1.3067,0.8833,1.0111,1.0335,27.549,644,30.3333,15.0,56,0.1,Female,87,17,0
1,1.0081,0.1062,0.2857,0.2381,1.125,0.884,0.9867,1.0081,20.0794,28,13.6667,-19.4444,28,0.3333,Female,89,21,1
2,1.0082,0.087,0.3636,0.3636,1.1,0.9225,1.0225,1.0082,49.0909,0,0.0,0.0,0,0.5,Female,83,11,2
3,1.0153,0.1222,0.2105,0.2105,1.2025,0.87,1.027,1.0153,71.2281,166,40.3333,0.0,329,0.6667,Female,75,19,3
4,0.9664,0.0952,0.1818,0.3636,1.095,0.86,0.955,0.9664,62.1212,0,0.0,-5.5556,0,0.6667,Female,81,11,4


In [9]:
# Persist the per-patient summary table (index included).
df.to_csv('/home/jupyter/charliemacuject/research_papers/data/adherence_twoyears.csv')

In [8]:
# Sanity check: longest time to peak vision, in days.
max(df.time_to_peak)

711