In [70]:
import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms

from ipynb.fs.defs.ImportDB import *

import warnings
warnings.filterwarnings('ignore')

In [71]:
df = pd.read_csv('/home/jupyter/charliemacuject/research_papers/data/adherence_initiation.csv', low_memory=False)

In [67]:
df = df[df['time_above_baseline_1year'] != 'nil']
df['time_above_baseline_1year'] = pd.to_numeric(df['time_above_baseline_1year'])
df.dropna(subset=['time_above_baseline_1year'], inplace=True)
df = df[df['time_above_baseline_1year'] >= 0]
df = df[df['time_above_baseline_1year'] < 365]

In [59]:
df = df[df['time_above_baseline_2year'] != 'nil']
df['time_above_baseline_2year'] = pd.to_numeric(df['time_above_baseline_2year'])
df.dropna(subset=['time_above_baseline_2year'], inplace=True)
df = df[df['time_above_baseline_2year'] >= 0]
df = df[df['time_above_baseline_2year'] < 365]

In [62]:
df = df[df['time_above_baseline_4year'] != 'nil']
df['time_above_baseline_4year'] = pd.to_numeric(df['time_above_baseline_4year'])
df.dropna(subset=['time_above_baseline_4year'], inplace=True)
df = df[df['time_above_baseline_4year'] >= 0]
df = df[df['time_above_baseline_4year'] < 365]

In [73]:
class Stat_Test:
    
    def __init__(self, ad_measure, vis_outcome):
        self.ad_measure = ad_measure
        self.vis_outcome = vis_outcome
    
    def measures_print(self):
        print("The adherence measure is {}, and the visual outcome is {}.".format(self.ad_measure, self.vis_outcome))
        
    def median(self):
        median = np.median(df[self.ad_measure])
        return median
        
    def samples(self):
        median = np.median(df[self.ad_measure])
        early = df[df[self.ad_measure] < median]
        late = df[df[self.ad_measure] >= median]
        sample_1 = early[self.vis_outcome]
        sample_2 = late[self.vis_outcome]
        return sample_1, sample_2, median
    
    def t_test_ind(self):
        sample_1, sample_2, median = self.samples()
        t, pval = stats.ttest_ind(sample_1, sample_2)
        return t, pval
        
    def mann_whitney(self):
        sample_1, sample_2, median = self.samples()
        t, pval = stats.mannwhitneyu(sample_1, sample_2)
        return t, pval
            
    def all_tests(self):
        t_1, p_1 = self.t_test_ind()
        t_2, p_2 = self.mann_whitney()
        sample_1, sample_2, cutoff = self.samples()
        median = np.round(cutoff, 2)
        x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
        print("Those in Group 1 had adherence measure less than {}, and those in Group 2 had adherence measure greater than {}.".format(median, median))
        print("The mean for Early is {} and the mean for Late is {}.".format(np.round(x_1, 2), np.round(x_2, 2)))
        print("The test statistic for the independent t-test is {} and the p-value is {}.".format(np.round(t_1, 3), 
                                                                                                     np.round(p_1, 5)))
        print("The test statistic for the parametric t-test is {} and the p-value is {}.".format(np.round(t_2, 3), 
                                                                                                     np.round(p_2, 5)))
        if p_1 < 0.05 and p_2 > 0.05:
            print("The independent t-test returned a significant result whilst the parametric t-test did not.")
        elif p_1 > 0.05 and p_2 < 0.05:
            print("The parametric t-test returned a significant result whilst the independent t-test did not.")
        elif p_1 < 0.05 and p_2 < 0.05:
            print("Both tests returned a significant result.")
        else:
            print("Neither test returned a significant result. It is likely these are the same distribution.")
            
    # rotating samples function
    def samples_search(self):
        minimum = np.round(df[self.ad_measure].min(), 2)
        maximum = np.round(df[self.ad_measure].max(), 2)
        data_range = np.linspace(minimum, maximum, 100)
        diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value = 0, 0, 0, 0, 0, 0, 100
        for i in range(len(data_range)):
            cutoff = data_range[i]
            early = df[df[self.ad_measure] < cutoff]
            late = df[df[self.ad_measure] >= cutoff]
            if len(early) != 0 and len(late) != 0:
                sample_1, sample_2 = early[self.vis_outcome], late[self.vis_outcome]
                x_1, x_2 = np.mean(sample_1), np.mean(sample_2)
                t, pval = self.mann_whitney()
                if pval < 0.05 and abs(x_1 - x_2) > diff and len(sample_1) > 100 and len(sample_2) > 100:
                    difference = abs(x_1 - x_2)
                    diff = np.round(difference, 4)
                    mean_1, mean_2 = np.round(x_1, 2), np.round(x_2, 2)
                    cut_point = np.round(cutoff, 4)
                    len_s1, len_s2 = len(sample_1), len(sample_2)
                    p_value = np.round(pval, 5)
                    i += 1
                else: 
                    i += 1
        return diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value

    def confidence_interval(self, sample_1, sample_2):
        cm = sms.CompareMeans(sms.DescrStatsW(sample_1), sms.DescrStatsW(sample_2))
        conf_int = cm.tconfint_diff(usevar='unequal')
        return conf_int
    
    def samples_ci(self, cut):
        early = df[df[self.ad_measure] < cut]
        late = df[df[self.ad_measure] >= cut]
        sample_1 = early[self.vis_outcome]
        sample_2 = late[self.vis_outcome]
        return sample_1, sample_2

    
    # print results from sample_search
    def print_samples_search(self):
        diff, cut_point, len_s1, len_s2, mean_1, mean_2, p_value = self.samples_search()
        sample_1, sample_2 = self.samples_ci(cut_point)
        ci = self.confidence_interval(sample_1,sample_2)
        if diff > 0:
            print("Adherence measure: {}".format(self.ad_measure))
            print("The cutoff point which maximised the difference in visual outcome between the groups is {}.".format(cut_point))
            print("This gave a p-value of {}, with {} people in the EARLY GROUP and {} people in the LATE GROUP.".format(p_value, len_s1, len_s2))
            print("The early group had mean {} of {}, and the late group had mean {} of {}.".format(self.vis_outcome, mean_1, self.vis_outcome, mean_2))
            print("Thus, the maximum difference in {} between the two groups (that maintained significance) was {}.".format(self.vis_outcome, diff))
            print("This gave a confidence interval of {}.".format(ci))
        else:
            print("There was no cutoff point for {} that found a significant difference in {}.".format(self.ad_measure, self.vis_outcome))

In [68]:
p1 = Stat_Test("percentage_late", "time_above_baseline_1year")
p1.print_samples_search()

There was no cutoff point for percentage_late that found a significant difference in time_above_baseline_1year.


In [6]:
def all_combinations():
    results = pd.DataFrame(columns = ['mean_adherence', 'adherence_variation',
       'percentage_late', 'MAFL', 'MAFE'], index = ['mean_vision',
       'time_above_baseline_1year', 'time_above_baseline_2year',
       'time_above_baseline_4year', 'proportion_above_baseline',
       'peak_visual_improvement', 'loss_from_peak', 'time_to_peak'])
    ad_measures = ['mean_adherence', 'adherence_variation',
       'percentage_late', 'MAFL', 'MAFE']
    vis_outcomes = ['mean_vision',
       'time_above_baseline_1year', 'time_above_baseline_2year',
       'time_above_baseline_4year', 'proportion_above_baseline',
       'peak_visual_improvement', 'loss_from_peak', 'time_to_peak']
    for i in range(len(ad_measures)):
        for j in range(len(vis_outcomes)):
            p1 = Stat_Test(ad_measures[i], vis_outcomes[j])
            t, pval = p1.t_test_ind()
            results[ad_measures[i]][vis_outcomes[j]] = pval
    return results

In [11]:
df = df[df['time_above_baseline_1year'] != 'nil']

In [13]:
df['time_above_baseline_1year']

0        0
1        0
2      343
3       69
4       28
      ... 
461    105
463      0
465     28
466      0
467     91
Name: time_above_baseline_1year, Length: 329, dtype: object

In [15]:
p1 = Stat_Test("percentage_late", "loss_from_peak")
p1.all_tests()

Those in Group 1 had adherence measure less than 0.2, and those in Group 2 had adherence measure greater than 0.2.
The mean for Early is 0.18 and the mean for Late is 0.2.
The test statistic for the independent t-test is -1.442 and the p-value is 0.15019.
The test statistic for the parametric t-test is 12289.0 and the p-value is 0.07541.
Neither test returned a significant result. It is likely these are the same distribution.


In [74]:
def all_cutoff_points_print():
    ad_measures = ['mean_adherence', 'adherence_variation',
       'percentage_late', 'MAFL', 'MAFE']
    vis_outcomes = ['mean_vision', 'proportion_above_baseline',
       'peak_visual_improvement', 'loss_from_peak', 'time_to_peak']
    for i in range(len(ad_measures)):
        for j in range(len(vis_outcomes)):
            p1 = Stat_Test(ad_measures[i], vis_outcomes[j])
            p1.print_samples_search()
            print("       ")
            
all_cutoff_points_print()

There was no cutoff point for mean_adherence that found a significant difference in mean_vision.
       
Adherence measure: mean_adherence
The cutoff point which maximised the difference in visual outcome between the groups is 1.0071.
This gave a p-value of 0.00061, with 261 people in the EARLY GROUP and 129 people in the LATE GROUP.
The early group had mean proportion_above_baseline of 0.35, and the late group had mean proportion_above_baseline of 0.25.
Thus, the maximum difference in proportion_above_baseline between the two groups (that maintained significance) was 0.1012.
This gave a confidence interval of (0.03802267697097807, 0.16445259701993353).
       
Adherence measure: mean_adherence
The cutoff point which maximised the difference in visual outcome between the groups is 0.9975.
This gave a p-value of 0.0, with 176 people in the EARLY GROUP and 214 people in the LATE GROUP.
The early group had mean peak_visual_improvement of 0.24, and the late group had mean peak_visual_impro

In [18]:
pdf = df[df['time_above_baseline_1year'] != 'nil']

In [None]:
pdf = df[df['time_above_baseline_1year'] != 'nil']
t, pval = stats.mannwhitneyu(sample_1, sample_2)