# What are we measuring in the initiation stage?
* Best minus first → HIGHER is better
* AUC → HIGHER is better
* Above start → HIGHER is better
* Best point → DEPENDS (look at how best point relates to other visual outcomes, especially visual outcomes over whole patient journey)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from scipy import stats
import seaborn as sns
from scipy.integrate import quad
from scipy.stats import norm
import statsmodels.api as sm
import statsmodels.stats.api as sms

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/home/jupyter/macuject_automated/data/initiation_df.csv')
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [3]:
df

Unnamed: 0,gender,age,ExamGraph,InjectionType,NextTime,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,ad_five,prev_vision,mean_vision,last_five,last_ten,last_three,std_vision,std_five,ID
0,Female,100,0.166667,Lucentis,13.0,668,12.0,1.0,0.67,0.4714,0.67,0.166667,0.19,0.16,0.15,0.18,0.0651,0.0416,0
1,Female,100,0.100000,Lucentis,4.0,843,12.0,1.0,0.62,0.4621,0.53,0.166667,0.19,0.17,0.15,0.17,0.0607,0.0000,0
2,Female,100,0.166667,Lucentis,4.0,871,4.0,1.0,0.69,0.4445,0.53,0.100000,0.18,0.15,0.15,0.14,0.0622,0.0267,0
3,Female,100,0.166667,Lucentis,4.0,899,4.0,1.0,0.73,0.4259,0.78,0.166667,0.18,0.15,0.15,0.14,0.0609,0.0267,0
4,Female,100,0.166667,Lucentis,4.0,927,4.0,1.0,0.76,0.4081,0.78,0.166667,0.18,0.15,0.16,0.14,0.0597,0.0267,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16531,Male,76,1.500000,Lucentis,6.0,59,4.0,1.0,1.11,0.0000,1.11,1.200000,0.93,0.93,0.93,0.93,0.2667,0.2667,1115
16532,Male,76,1.000000,Lucentis,6.0,157,8.0,1.0,1.04,0.0519,1.04,1.500000,1.22,1.22,1.22,1.40,0.3403,0.3403,1115
16533,Male,76,1.000000,Lucentis,6.0,199,6.0,1.0,1.03,0.0476,1.03,1.000000,1.17,1.30,1.17,1.33,0.3165,0.2121,1115
16534,Male,76,1.200000,Lucentis,8.0,339,8.0,1.0,0.97,0.1277,0.92,1.200000,1.20,1.18,1.20,1.35,0.2816,0.2046,1115


## Rework dataframe to get what we need

### Adherence measures

In [4]:
# mean adherence
def mean_adherence(df):
    lst = df['adherence_factor'].to_list()
    mean = sum(lst) / len(lst)
    return mean

# standard deviation of adherence
def stdev_ad(df):
    lst = df['adherence_factor'].to_list()
    stdev = np.std(lst)
    return stdev

# frequency of lateness
def freq_late(df):
    lst = df['adherence_factor'].to_list()
    count = len([i for i in lst if i > 1]) 
    prop = count / len(lst)
    return prop

# extremity of lateness
def extreme_late(df):
    lst = df['adherence_factor'].to_list()
    late_lst = [i for i in lst if i > 1]
    if len(late_lst) > 0:
        mean = sum(late_lst) / len(late_lst)
    else:
        mean = 1
    return mean

# extremity of earliness
def extreme_early(df):
    lst = df['adherence_factor'].to_list()
    late_lst = [i for i in lst if i < 1]
    if len(late_lst) > 0:
        mean = sum(late_lst) / len(late_lst)
    else:
        mean = 1
    return mean

# mean patient interval length for late visits
def int_length_late(df):
    df_late = df[df['adherence_factor'] > 1]
    lst = df_late['actual_time']
    if len(lst) > 0:
        mean = sum(lst) / len(lst)
    else:
        mean = sum(df['actual_time']) / len(df)
    return mean

# mean patient interval length for early visits
def int_length_early(df):
    df_late = df[df['adherence_factor'] < 1]
    lst = df_late['actual_time']
    if len(lst) > 0:
        mean = sum(lst) / len(lst)
    else:
        mean = sum(df['actual_time']) / len(df)
    return mean

### Visual outcomes

In [5]:
# function to get best to first scores
def best_first(df):
    lst = df['ExamGraph'].tolist()
    max_value = max(lst)
    first_value = lst[0]
    difference = max_value - first_value
    return difference

# proportion of time spent above starting vision
def above_start(df):
    lst = df['ExamGraph'].to_list()
    count = 0
    starting_vision = lst[0]
    for i in range(1, len(lst)):
        if lst[i] > starting_vision:
            count += 1
    prop = count / len(lst)
    return np.round(prop, 2)

# time to best
def best_point(df):
    max_value = df['ExamGraph'].max()
    df_fin = df[df['ExamGraph'] == max_value]
    best_point = df_fin['DaysFirst'].iloc[0]
    last_point = df['DaysFirst'].iloc[-1]
    return np.round(best_point / last_point, 2)

# area under the curve function
class Polynomial:
    
    def __init__(self, coeff_list):
        """ input: coefficients are in the form a_n, ...a_1, a_0 
        """
        self.coefficients = coeff_list # tuple is turned into a list
    
    # The __repr__ and __str__ method can be included here,
    # but is not necessary for the immediately following code
    
    def __repr__(self):
        """
        method to return the canonical string representation 
        of a polynomial.
        """
        return "Polynomial" + str(tuple(self.coefficients))
            
    def __call__(self, x):    
        res = 0
        for coeff in self.coefficients:
            res = res * x + coeff
        return res 
    
    def degree(self):
        return len(self.coefficients) 
    
def AUC_vision(df):
    time = np.array(df['DaysFirst'])
    vision = np.array(df['prev_vision'])
    starting_vision = vision[0]
    vision = vision - starting_vision
    # poly_fit = np.polyfit(time, vision, np.round(np.sqrt(len(time))))
    results_list = []
    rnge = int(np.round(len(time)/2, 1))
    for i in range(2, rnge):
        poly_fit = np.polyfit(time, vision, i)
        coefs = poly_fit.tolist()
        p = Polynomial(coefs)
        # function we want to integrate
        def f(x):
            return p(x)
        # call quad to integrate f from 0 to last visit
        res, err = quad(f, 0, time[-1])
        results_list.append(res)
    
    # call quad to integrate f from 0 to last visit
    res = np.mean(results_list)
    
    return res / len(time)

### Dataframe generation

In [6]:
def dataframe_gen(df, pat_id):
    df = df[df["ID"] == pat_id]
    b_point = best_point(df)
    above = above_start(df)
    best_f = best_first(df)
    AUC = AUC_vision(df)
    data = {'mean_adherence': [mean_adherence(df)], 'stdev_ad': [stdev_ad(df)],
            'freq_late': [freq_late(df)], 'ext_late': [extreme_late(df)],
            'ext_early': [extreme_early(df)], 'int_late': [int_length_late(df)],
            'int_early': [int_length_early(df)],
           'best_point': [b_point], 'above_start': [above], 
            'best_first': [best_f], 'auc': [AUC]}
    new_df = pd.DataFrame(data)
    return new_df

In [7]:
id_list = df["ID"].unique()
dataframe_gen(df, id_list[0])

Unnamed: 0,mean_adherence,stdev_ad,freq_late,ext_late,ext_early,int_late,int_early,best_point,above_start,best_first,auc
0,1.021818,0.051313,0.272727,1.093333,0.96,4.38,3.86,0.85,0.27,0.033333,309.932878


In [8]:
def master_dataframe(df):
    id_list = df["ID"].unique()
    frames = []
    for i in range(len(id_list)):
        try:
            pdf = dataframe_gen(df, id_list[i])
            pdf['ID'] = i
            frames.append(pdf)
        except:
            i += 1
    master = pd.concat(frames)
    master.reset_index(inplace=True)
    master.drop(columns=['index'], inplace=True)
    return master

In [11]:
initiation_df = master_dataframe(df)
initiation_df.head()

Unnamed: 0,mean_adherence,stdev_ad,freq_late,ext_late,ext_early,int_late,int_early,best_point,above_start,best_first,auc,ID
0,1.021818,0.051313,0.272727,1.093333,0.96,4.38,3.86,0.85,0.27,0.033333,309.932878,0
1,1.0,0.0,0.0,1.0,1.0,10.0,10.0,0.29,0.0,0.0,,1
2,1.021111,0.149773,0.166667,1.29,0.836667,7.573333,4.426667,0.16,0.61,0.5,22.364874,2
3,1.784,2.410279,0.3,3.783333,0.83,8.38,6.856667,0.09,0.0,0.0,-6.901273,3
4,1.012414,0.070937,0.206897,1.126667,0.9,4.501667,3.605,0.05,0.0,0.0,15.573127,4


In [12]:
initiation_df.to_csv('/home/jupyter/macuject_automated/Adherence_paper/initiation_all_stats.csv')

## Class: Stat_Test

In [13]:
df = pd.read_csv('/home/jupyter/macuject_automated/Adherence_paper/initiation_all_stats.csv')
df = df[df['auc'] < 10000]
df = df[df['auc'] > -10000]

In [66]:
class Stat_Test:
    
    def __init__(self, ad_measure, vis_outcome):
        self.ad_measure = ad_measure
        self.vis_outcome = vis_outcome
    
    def measures_print(self):
        print("The adherence measure is {}, and the visual outcome is {}.".format(self.ad_measure, self.vis_outcome))
        
    def median(self):
        median = np.median(df[self.ad_measure])
        return median
        
    def samples(self):
        median = np.median(df[self.ad_measure])
        early = df[df[self.ad_measure] < median]
        late = df[df[self.ad_measure] >= median]
        sample_1 = early[self.vis_outcome]
        sample_2 = late[self.vis_outcome]
        return sample_1, sample_2, median
    
    def t_test_ind(self):
        sample_1, sample_2, median = self.samples()
        t, pval = stats.ttest_ind(sample_1, sample_2)
        return t, pval
        
    def mann_whitney(self):
        sample_1, sample_2, median = self.samples()
        t, pval = stats.mannwhitneyu(sample_1, sample_2)
        return t, pval
            
    def all_tests(self):
        t_1, p_1 = self.t_test_ind()
        t_2, p_2 = self.mann_whitney()
        sample_1, sample_2, cutoff = self.samples()
        median = np.round(cutoff, 2)
        x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
        print("Those in Group 1 had adherence measure less than {}, and those in Group 2 had adherence measure greater than {}.".format(median, median))
        print("The mean for Early is {} and the mean for Late is {}.".format(np.round(x_1, 2), np.round(x_2, 2)))
        print("The test statistic for the independent t-test is {} and the p-value is {}.".format(np.round(t_1, 3), 
                                                                                                     np.round(p_1, 5)))
        print("The test statistic for the parametric t-test is {} and the p-value is {}.".format(np.round(t_2, 3), 
                                                                                                     np.round(p_2, 5)))
        if p_1 < 0.05 and p_2 > 0.05:
            print("The independent t-test returned a significant result whilst the parametric t-test did not.")
        elif p_1 > 0.05 and p_2 < 0.05:
            print("The parametric t-test returned a significant result whilst the independent t-test did not.")
        elif p_1 < 0.05 and p_2 < 0.05:
            print("Both tests returned a significant result.")
        else:
            print("Neither test returned a significant result. It is likely these are the same distribution.")
            
    # rotating samples function
    def samples_search(self):
        minimum = np.round(df[self.ad_measure].min(), 2)
        maximum = np.round(df[self.ad_measure].max(), 2)
        data_range = np.linspace(minimum, maximum, 100)
        diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value = 0, 0, 0, 0, 0, 0, 100
        for i in range(len(data_range)):
            cutoff = data_range[i]
            early = df[df[self.ad_measure] < cutoff]
            late = df[df[self.ad_measure] >= cutoff]
            if len(early) != 0 and len(late) != 0:
                sample_1, sample_2 = early[self.vis_outcome], late[self.vis_outcome]
                x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
                t, pval = self.mann_whitney()
                if pval < 0.05 and abs(x_1 - x_2) > diff and len(sample_1) > 100 and len(sample_2) > 100:
                    difference = abs(x_1 - x_2)
                    diff = np.round(difference, 4)
                    mean_1, mean_2 = np.round(x_1, 2), np.round(x_2, 2)
                    cut_point = np.round(cutoff, 4)
                    len_s1, len_s2 = len(sample_1), len(sample_2)
                    p_value = np.round(pval, 5)
                    i += 1
                else: 
                    i += 1
        return diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value

    def confidence_interval(self):
        sample_1, sample_2, cutoff = self.samples()
        cm = sms.CompareMeans(sms.DescrStatsW(sample_1), sms.DescrStatsW(sample_2))
        conf_int = cm.tconfint_diff(usevar='unequal')
        return conf_int
    
    def own_confint(self):
        sample_1, sample_2, cutoff = self.samples()
        x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
        n_1, n_2 = len(sample_1), len(sample_2)
        std_1, std_2 = np.std(sample_1), np.std(sample_2)
        diff = x_1 - x_2
        mult = 1.96*np.sqrt(((std_1**2)/n_1) + ((std_2**2)/n_2))
        lower, upper = (diff - mult), (diff + mult)
        return (lower, upper)

    
    # print results from sample_search
    def print_samples_search(self):
        diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value = self.samples_search()
        ci = self.own_confint()
        if diff > 0:
            print("Adherence measure: {}".format(self.ad_measure))
            print("The cutoff point which maximised the difference in visual outcome between the groups is {}.".format(cut_point))
            print("This gave a p-value of {}, with {} people in the EARLY GROUP and {} people in the LATE GROUP.".format(p_value, len_s1, len_s2))
            print("The early group had mean {} of {}, and the late group had mean {} of {}.".format(self.vis_outcome, mean_1, self.vis_outcome, mean_2))
            print("Thus, the maximum difference in {} between the two groups (that maintained significance) was {}.".format(self.vis_outcome, diff))
            print("This gave a confidence interval of {}.".format(ci))
        else:
            print("There was no cutoff point for {} that found a significant difference in {}.".format(self.ad_measure, self.vis_outcome))
            
p1 = Stat_Test("ext_late", "best_first")
p1.print_samples_search()

Adherence measure: ext_late
The cutoff point which maximised the difference in visual outcome between the groups is 1.1292.
This gave a p-value of 0.02516, with 350 people in the EARLY GROUP and 330 people in the LATE GROUP.
The early group had mean best_first of 0.2, and the late group had mean best_first of 0.22.
Thus, the maximum difference in best_first between the two groups (that maintained significance) was 0.0271.
This gave a confidence interval of (-0.058583960278429506, 0.0038495052532944857).


In [64]:
def own_confint(sample_1, sample_2):
        x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
        n_1, n_2 = len(sample_1), len(sample_2)
        std_1, std_2 = np.std(sample_1), np.std(sample_2)
        diff = x_1 - x_2
        mult = 1.96*np.sqrt(((std_1**2)/n_1) + ((std_2**2)/n_2))
        lower, upper = (diff - mult), (diff + mult)
        return (lower, upper)
    
def samples():
        median = np.median(df['ext_late'])
        early = df[df['ext_late'] < median]
        late = df[df['ext_late'] >= median]
        sample_1 = early['best_first']
        sample_2 = late['best_first']
        return sample_1, sample_2

a, b = samples()

def confidence_interval(sample_1, sample_2):
        cm = sms.CompareMeans(sms.DescrStatsW(sample_1), sms.DescrStatsW(sample_2))
        conf_int = cm.tconfint_diff(usevar='unequal')
        return conf_int
    
own_confint(a, b)

(-0.058583960278429506, 0.0038495052532944857)

In [65]:
confidence_interval(a,b)

(-0.05868543078456793, 0.003950975759432908)

In [47]:
def confidence_interval(sample_1, sample_2):
        cm = sms.CompareMeans(sms.DescrStatsW(sample_1), sms.DescrStatsW(sample_2))
        ci = cm.tconfint_diff(usevar='unequal')
        return ci
    
X1, X2 = np.arange(10,21), np.arange(20,26.5,.5)
ci = confidence_interval(X1, X2)
print("The confidence interval is {}.".format(ci))

The confidence interval is (-10.414599391793885, -5.585400608206114).


In [16]:
def all_cutoff_points():
    ad_measures = ['mean_adherence', 'stdev_ad', 'freq_late', 'ext_late', 'ext_early']
    vis_outcomes = ['best_point', 'above_start', 'best_first', 'auc']
    # create the dataframe
    results = pd.DataFrame(columns = ['mean_adherence', 'stdev_ad', 'freq_late', 'ext_late',
       'ext_early'], index = ['best_point', 'above_start', 'best_first', 'auc'])
    for i in range(len(ad_measures)):
        for j in range(len(vis_outcomes)):
            p1 = Stat_Test(ad_measures[i], vis_outcomes[j])
            diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value = p1.samples_search()
            results[ad_measures[i]][vis_outcomes[j]] = cut_point
    return results
            
results_initiation = all_cutoff_points()
results_initiation.to_excel("results_initiation.xlsx")

In [68]:
def all_cutoff_points_print():
    ad_measures = ['mean_adherence', 'stdev_ad', 'freq_late', 'ext_late', 'ext_early']
    vis_outcomes = ['best_point', 'above_start', 'best_first', 'auc']
    for i in range(len(ad_measures)):
        for j in range(len(vis_outcomes)):
            p1 = Stat_Test(ad_measures[i], vis_outcomes[j])
            p1.print_samples_search()
            print("       ")
            
all_cutoff_points_print()

There was no cutoff point for mean_adherence that found a significant difference in best_point.
       
There was no cutoff point for mean_adherence that found a significant difference in above_start.
       
There was no cutoff point for mean_adherence that found a significant difference in best_first.
       
There was no cutoff point for mean_adherence that found a significant difference in auc.
       
There was no cutoff point for stdev_ad that found a significant difference in best_point.
       
There was no cutoff point for stdev_ad that found a significant difference in above_start.
       
There was no cutoff point for stdev_ad that found a significant difference in best_first.
       
There was no cutoff point for stdev_ad that found a significant difference in auc.
       
There was no cutoff point for freq_late that found a significant difference in best_point.
       
There was no cutoff point for freq_late that found a significant difference in above_start.
       
There