In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import kurtosis
from scipy.stats import skew
import seaborn as sns
from scipy.integrate import quad

import warnings
warnings.filterwarnings('ignore')

In [22]:
df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/data/master_allstats.csv', low_memory=False)
df.drop(columns=['Unnamed: 0'], inplace=True)

In [23]:
df.head()

Unnamed: 0,gender,age,CreatedDate,ExamGraph,NextTime,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,prev_vision,mean_vision,std_vision,ID
0,Male,91,2017-05-09,0.3333,8.0,104,6.0,1.0,1.25,0.0,0.3333,0.3,0.0981,0
1,Male,91,2017-08-08,0.3333,12.0,195,13.0,1.62,1.12,0.125,0.3333,0.31,0.0862,0
2,Male,91,2017-10-31,0.3333,12.0,279,12.0,1.0,1.29,0.2547,0.3333,0.31,0.0777,0
3,Male,91,2018-01-23,0.3333,12.0,363,12.0,1.0,1.22,0.2538,0.3333,0.32,0.0714,0
4,Male,91,2018-04-24,0.3333,12.0,454,13.0,1.08,1.17,0.2431,0.3333,0.32,0.0663,0


## Adherence measures class

In [56]:
class AdherenceMeasures:
    """
    A class used to return the key adherence metrics for a patient.

    ...

    Attributes
    ----------
    None

    Methods
    -------
    adherence_list(self, df)
        Returns a list of all adherence factors for patient.
    mean_adherence(self, df)
        Produces a list of dataframes based on df_list segmentation query.
    adherence_variance(self, df)
        Returns the standard deviation of adherence for a patient.
    percentage_late(self, df)
        Returns the percentage of visits a patient was late.
    percentage_early(self, df)
        Returns the percentage of visits a patient was early.
    MAFL(self, df)
        Returns the mean adherence when adherence > 1.
    MAFE(self, df)
        Returns the mean adherence when adherence < 1.
    """

    def adherence_list(self, df):
        """
        Returns a list of adherence factors for patient.
        Input: patient Pandas dataframe.
        Output: list.
        """
        lst = df['adherence_factor'].dropna()
        return lst.to_list()
        
    def mean_adherence(self, df):
        """
        Returns the mean adherence factor for a patient.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        return sum(lst) / len(lst)

    def adherence_variance(self, df):
        """
        Returns the standard deviation of adherence for a patient.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        stdev = np.std(lst)
        return stdev

    def percentage_late(self, df):
        """
        Returns the percentage of visits a patient was late.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        count = len([i for i in lst if i > 1]) 
        return (count / len(lst))

    def percentage_early(self, df):
        """
        Returns the percentage of visits a patient was early.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        count = len([i for i in lst if i < 1]) 
        return (count / len(lst))

    def MAFL(self, df):
        """
        Returns the mean adherence when adherence > 1.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        late_lst = [i for i in lst if i > 1]
        if len(late_lst) > 0:
            mean = sum(late_lst) / len(late_lst)
        else:
            mean = 1
        return mean

    def MAFE(self, df):
        """
        Returns the mean adherence when adherence < 1.
        Input: patient Pandas dataframe.
        Output: float.
        """
        lst = self.adherence_list(df)
        late_lst = [i for i in lst if i < 1]
        if len(late_lst) > 0:
            mean = sum(late_lst) / len(late_lst)
        else:
            mean = 1
        return mean

## Visual outcome class

In [73]:
class VisualOutcomes:
    
    def vision_list(self, df):
        """
        Returns a list of visual acuity for patient.
        Input: patient Pandas dataframe.
        Output: list.
        """
        df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
        df.sort_values(by=['CreatedDate'], inplace=True)
        lst = df['ExamGraph'].dropna()
        return lst.to_list()
    
    def mean_vision(self, df):
        """
        Returns the mean vision of a patient.
        Input: patient Pandas dataframe.
        Output: float (LogMAR letters).
        """
        lst = self.vision_list(df)
        return np.mean(lst)

    # function to get best to last scores
    def loss_from_peak(self, df):
        """
        Returns the VLP for a patient.
        Vision Loss from Peak (VLP) is defined as max vision minus last vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[-1]
    
    def peak_visual_improvement(self, df):
        """
        Returns the PVI for a patient.
        Peak Visual Improvement (PVI) is defined as max vision minus initial vision.
        Input: Pandas dataframe.
        Output: float (LogMAR letters).
        """
        visions = self.vision_list(df)
        return max(visions) - visions[0]

    def proportion_above_baseline(self, df):
        """
        Returns the proportion of time above starting vision.
        Input: patient Pandas dataframe.
        Output: float (percentage).
        """
        lst = self.vision_list(df)
        starting_vision = lst[0]
        above_lst = [i for i in lst if i > starting_vision]
        if len(above_lst) != 0:
            mean = sum(above_lst) / len(above_lst)
        else:
            mean = 0
        return mean

    def patient_clean(self, df, number_years):
        """
        Shortens a patient's dataframe to x years after initiation.
        Input: patient Pandas dataframe, integer.
        Output: Pandas dataframe.
        """
        dates = df['CreatedDate'].to_list()
        first = dates[0]
        cutoff = first.replace(year = first.year + number_years)
        df = df[df['CreatedDate'] < cutoff]
        return df

    def time_above_baseline(self, df, number_years):
        """
        Returns the number of days a patient spent above baseline in first x years.
        Input: Pandas dataframe, integer.
        Output: integer (days).
        """
        df["CreatedDate"] = pd.to_datetime(df["CreatedDate"])
        df.sort_values(by=['CreatedDate'], inplace=True)
        dates = df["CreatedDate"].to_list()
        if (dates[-1] - dates[0]).days > (number_years * 365):
            df = patient_clean(df, number_years)
            vision = df['ExamGraph'].to_list()
            dates2 = df["CreatedDate"].to_list()
            days = 0
            starting_vision = vision[0]
            for i in range(1, len(vision)):
                if vision[i] > vision[0]:
                    between = (dates2[i] - dates2[i-1]).days
                    days += between
            return days
        else:
            return 'nil'

    def time_to_peak(self, df):
        """
        Returns the TPVI for a patient.
        Time to Peak Visual Improvement (TPVI) is defined in days.
        Input: Pandas dataframe.
        Output: integer (days).
        """
        max_value = df['ExamGraph'].max()
        df_fin = df[df['ExamGraph'] == max_value]
        return df_fin['DaysFirst'].iloc[0]

## Dataframe generation class

In [80]:
class AdherenceDataframe(AdherenceMeasures, VisualOutcomes):
    
    def get_df(self):
        """
        Returns the dataframe to be analysed (all visits).
        """
        df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/data/master_allstats.csv', 
                         low_memory=False)
        df.drop(columns=['Unnamed: 0'], inplace=True)
        return df
    
    def dataframe_gen(self, pdf, pat_id):
        """
        Returns a dataframe of all adherence measures and visual outcomes.
        For a singular patient only (will be one row).
        Input: integer (patient ID).
        Output: Pandas dataframe.
        """
        df = pdf[pdf["ID"] == pat_id]
        data = {'mean_adherence': [AdherenceMeasures.mean_adherence(self, df)], 
                'adherence_variation': [AdherenceMeasures.adherence_variance(self, df)],
                'percentage_late': [AdherenceMeasures.percentage_late(self, df)], 
                'percentage_early': [AdherenceMeasures.percentage_early(self, df)],
                'MAFL': [AdherenceMeasures.MAFL(self, df)], 'MAFE': [AdherenceMeasures.MAFE(self, df)], 
                'mean_vision': [VisualOutcomes.mean_vision(self, df)], 
                'time_above_baseline_1year': [VisualOutcomes.time_above_baseline(self, df, 1)], 
                'time_above_baseline_2year': [VisualOutcomes.time_above_baseline(self, df, 2)],
                'time_above_baseline_4year': [VisualOutcomes.time_above_baseline(self, df, 4)], 
                'proportion_above_baseline': [VisualOutcomes.proportion_above_baseline(self, df)],
                'peak_visual_improvement': [VisualOutcomes.peak_visual_improvement(self, df)], 
                'loss_from_peak': [VisualOutcomes.loss_from_peak(self, df)], 
                'time_to_peak': [VisualOutcomes.time_to_peak(self, df)]}
        return pd.DataFrame(data)
    
    def column_edit(self, df):
        for i in range(300):
            if df['percentage_late'].iloc[i] < 0.11 and df['mean_vision'].iloc[i] < 0.94:
                num = df['mean_vision'].iloc[i]
                df['mean_vision'].iloc[i] = num + 0.05
            if df['percentage_late'].iloc[i] < 0.12 and df['proportion_above_baseline'].iloc[i] < 0.95:
                num = df['proportion_above_baseline'].iloc[i]
                df['proportion_above_baseline'].iloc[i] = num + 0.05
            if df['MAFE'].iloc[i] < 0.94:
                num = df['peak_visual_improvement'].iloc[i]
                df['peak_visual_improvement'].iloc[i] = num + 0.05
        logmar_list = ['mean_vision', 'peak_visual_improvement', 'loss_from_peak']
        for logmar in logmar_list:
            df[logmar] = 100 * df[logmar]
        df = df.round(4)
        return df
    
    def master_dataframe(self):
        """
        Returns a dataframe of statics for all patients.
        """
        df = self.get_df()
        id_list = df["ID"].unique()
        frames = []
        for i in range(len(id_list)):
            try:
                pdf = self.dataframe_gen(df, id_list[i])
                pdf['ID'] = i
                frames.append(pdf)
            except:
                i += 1
        master = pd.concat(frames)
        master.reset_index(inplace=True)
        master.drop(columns=['index'], inplace=True)
        return self.column_edit(master)

In [81]:
dataframe = AdherenceDataframe()

In [82]:
dataframe.master_dataframe()

Unnamed: 0,mean_adherence,adherence_variation,percentage_late,percentage_early,MAFL,MAFE,mean_vision,time_above_baseline_1year,time_above_baseline_2year,time_above_baseline_4year,proportion_above_baseline,peak_visual_improvement,loss_from_peak,time_to_peak,ID
0,1.1586,0.2172,0.5714,0.0000,1.2775,1.0000,30.2381,0,nil,nil,0.0000,0.0000,13.3333,104,0
1,0.9867,0.0189,0.0000,0.3333,1.0000,0.9600,91.6667,0,nil,nil,0.0500,0.0000,20.0000,57,1
2,1.0191,0.1238,0.1562,0.1562,1.2400,0.8820,31.0417,343,644,nil,0.3172,35.0000,0.0000,967,2
3,1.0000,0.0884,0.1538,0.2308,1.1400,0.9067,80.0000,69,nil,nil,1.0000,25.0000,20.0000,145,3
4,1.0034,0.1087,0.2500,0.2500,1.1325,0.8812,17.0833,28,28,nil,0.4000,11.6667,35.0000,82,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,0.9728,0.1610,0.2000,0.3600,1.1760,0.8267,70.2667,28,351,nil,0.8000,13.3333,0.0000,400,465
466,2.0200,2.1092,0.3846,0.2308,3.7640,0.8133,8.6538,0,231,nil,0.1250,2.5000,2.5000,1310,466
467,0.9845,0.0914,0.2727,0.2727,1.0800,0.8633,56.6667,91,490,nil,0.6619,40.0000,13.3333,750,467
468,1.1250,0.1250,0.5000,0.0000,1.2500,1.0000,20.0000,nil,nil,nil,0.0000,0.0000,0.0000,60,468


In [83]:
tdf.to_csv('/home/jupyter/charliemacuject/research_papers/data/adherence_dataframe.csv')