In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import kurtosis
from scipy.stats import skew
import seaborn as sns
from scipy.integrate import quad

import warnings
warnings.filterwarnings('ignore')

In [167]:
# Load the per-visit initiation dataset and discard its 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv(
    '/home/jupyter/charliemacuject/research_papers/data/initiation_df.csv',
    low_memory=False,
).drop(columns=['Unnamed: 0'])

In [168]:
# Preview the first rows of the loaded per-visit table.
df.head()

Unnamed: 0,Unnamed: 0.1,gender,age,CreatedDate,ExamGraph,NextTime,DaysFirst,actual_time,adherence_factor,running_ad,st_dev,prev_vision,mean_vision,std_vision,ID
0,118,Male,91,2017-05-09,0.333333,8.0,104,6.0,1.0,1.25,0.0,0.333333,0.3,0.0981,0
1,119,Male,91,2017-08-08,0.333333,12.0,195,13.0,1.62,1.12,0.125,0.333333,0.31,0.0862,0
2,120,Male,91,2017-10-31,0.333333,12.0,279,12.0,1.0,1.29,0.2547,0.333333,0.31,0.0777,0
3,121,Male,91,2018-01-23,0.333333,12.0,363,12.0,1.0,1.22,0.2538,0.333333,0.32,0.0714,0
4,122,Male,91,2018-04-24,0.333333,12.0,454,13.0,1.08,1.17,0.2431,0.333333,0.32,0.0663,0


## Adherence measures functions

In [169]:
# mean adherence
def mean_adherence(df):
    """Return the arithmetic mean of the non-missing ``adherence_factor`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``adherence_factor`` column.

    Raises
    ------
    ZeroDivisionError
        If the column holds no non-missing values.
    """
    factors = df['adherence_factor'].dropna().to_list()
    return sum(factors) / len(factors)

# spread of adherence (standard deviation, despite the function name)
def adherence_variance(df):
    """Return the standard deviation of the non-missing ``adherence_factor`` values.

    The value comes from ``np.std`` with its default arguments, i.e. the
    population standard deviation (``ddof=0``), not the variance.
    """
    factors = df['adherence_factor'].dropna().to_list()
    return np.std(factors)

# frequency of lateness
def percentage_late(df):
    """Return the fraction of non-missing ``adherence_factor`` values greater than 1.

    The result is a proportion in [0, 1], not a percentage.

    Raises
    ------
    ZeroDivisionError
        If the column holds no non-missing values.
    """
    factors = df['adherence_factor'].dropna().to_list()
    n_late = sum(1 for factor in factors if factor > 1)
    return n_late / len(factors)

# extremity of lateness
def MAFL(df):
    """Return the mean of the ``adherence_factor`` values that are greater than 1.

    Missing values are ignored. When no value exceeds 1 the function
    returns the integer 1.
    """
    factors = df['adherence_factor'].dropna().to_list()
    late = [factor for factor in factors if factor > 1]
    if not late:
        return 1
    return sum(late) / len(late)

# extremity of earliness
def MAFE(df):
    """Return the mean of the ``adherence_factor`` values that are less than 1.

    Missing values are ignored. When no value is below 1 the function
    returns the integer 1.
    """
    factors = df['adherence_factor'].dropna().to_list()
    early = [factor for factor in factors if factor < 1]
    if not early:
        return 1
    return sum(early) / len(early)

## Visual outcome functions

In [170]:
# mean vision
def mean_vision(df):
    """Return the mean of the ``ExamGraph`` column.

    Side effect: rows whose ``ExamGraph`` is missing are removed from ``df``
    itself (in-place ``dropna``), so the caller's frame is shorter afterwards.
    """
    # NOTE(review): the in-place drop is kept on purpose -- code that runs
    # after this call on the same frame sees it without the missing rows.
    df.dropna(subset=['ExamGraph'], inplace=True)
    scores = df['ExamGraph'].to_list()
    return np.mean(scores)

# best score minus last score
def loss_from_peak(df):
    """Return the highest non-missing ``ExamGraph`` value minus the last one.

    "Last" means the final non-missing value in the frame's current row
    order; the function does not sort.

    Raises
    ------
    ValueError
        If the column holds no non-missing values (``max`` of an empty list).
    """
    scores = df['ExamGraph'].dropna().to_list()
    peak = max(scores)
    return peak - scores[-1]

# best score minus first score
def peak_visual_improvement(df):
    """Return the highest non-missing ``ExamGraph`` value minus the first one.

    "First" means the first non-missing value in the frame's current row
    order; the function does not sort.

    Raises
    ------
    ValueError
        If the column holds no non-missing values (``max`` of an empty list).
    """
    scores = df['ExamGraph'].dropna().to_list()
    peak = max(scores)
    return peak - scores[0]

# proportion of visits with vision above the starting value
def proportion_above_baseline(df):
    """Return the share of ``ExamGraph`` readings strictly above the first one.

    The numerator counts readings after the first that exceed it; the
    denominator is the total number of non-missing readings, including the
    first. The result is rounded to two decimals with ``np.round``.

    Raises
    ------
    IndexError
        If the column holds no non-missing values.
    """
    scores = df['ExamGraph'].dropna().to_list()
    baseline = scores[0]
    n_above = sum(1 for score in scores[1:] if score > baseline)
    return np.round(n_above / len(scores), 2)

def patient_clean(df, number_years):
    """Keep only the visits within ``number_years`` calendar years of the first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CreatedDate`` column. The first row is taken as the
        start of follow-up, so the frame is expected to be date-sorted already.
    number_years : int
        Length of the window in calendar years.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``CreatedDate`` is strictly before first date + ``number_years``.

    Raises
    ------
    IndexError
        If ``df`` has no rows.

    Notes
    -----
    ``df['CreatedDate']`` is converted to datetime on the frame passed in.
    """
    df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])
    first = df['CreatedDate'].iloc[0]
    # DateOffset clips 29 Feb to 28 Feb in non-leap target years, whereas
    # Timestamp.replace(year=...) raises ValueError for such a start date.
    cutoff = first + pd.DateOffset(years=number_years)
    return df[df['CreatedDate'] < cutoff]

# days spent above starting vision within the first n years
def time_above_baseline(df, number_years):
    """Return the days with vision above baseline in the first ``number_years`` years.

    For every visit inside the window whose ``ExamGraph`` is strictly above
    the first visit's value, the days elapsed since the previous visit are
    added to the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CreatedDate`` and ``ExamGraph`` columns.
    number_years : int
        Length of the observation window in years.

    Returns
    -------
    int or str
        The day count, or the string ``'nil'`` when the span between the first
        and last visit does not exceed ``number_years * 365`` days.

    Notes
    -----
    ``df`` is modified: ``CreatedDate`` is converted to datetime and the rows
    are sorted by it in place.
    """
    df["CreatedDate"] = pd.to_datetime(df["CreatedDate"])
    df.sort_values(by=['CreatedDate'], inplace=True)
    all_dates = df["CreatedDate"].to_list()
    follow_up_days = (all_dates[-1] - all_dates[0]).days
    # Written as "not (a > b)" so a non-comparable span (e.g. from a missing
    # date) still lands in the 'nil' branch.
    if not (follow_up_days > (number_years * 365)):
        return 'nil'

    window = patient_clean(df, number_years)
    scores = window['ExamGraph'].to_list()
    visit_dates = window["CreatedDate"].to_list()
    baseline = scores[0]
    total_days = 0
    for idx in range(1, len(scores)):
        if scores[idx] > baseline:
            total_days += (visit_dates[idx] - visit_dates[idx - 1]).days
    return total_days

# time to peak vision (days)
def time_to_peak(df):
    """Return ``DaysFirst`` for the first row whose ``ExamGraph`` equals the column maximum.

    Raises
    ------
    IndexError
        If no row matches the maximum (for example when every ``ExamGraph``
        value is missing).
    """
    peak_score = df['ExamGraph'].max()
    days_at_peak = df.loc[df['ExamGraph'] == peak_score, 'DaysFirst']
    return days_at_peak.iloc[0]

## Dataframe generation

In [171]:
def dataframe_gen(pdf, pat_id):
    """Build a one-row summary of adherence and visual-outcome measures for one patient.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Per-visit table holding every patient; must contain an ``ID`` column
        plus the columns the individual measure functions read.
    pat_id
        Value of ``ID`` that selects the patient.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per measure.
    """
    # Explicit copy: mean_vision and time_above_baseline modify the frame they
    # receive, which on a bare boolean-mask slice raises SettingWithCopyWarning.
    df = pdf[pdf["ID"] == pat_id].copy()

    # Call order matters: mean_vision drops rows with missing ExamGraph in
    # place, and time_above_baseline sorts the frame by CreatedDate in place,
    # so the measures below read a cleaned, date-ordered frame.
    vision = mean_vision(df)
    tab1 = time_above_baseline(df, 1)
    tab2 = time_above_baseline(df, 2)
    tab4 = time_above_baseline(df, 4)
    pab = proportion_above_baseline(df)
    pvi = peak_visual_improvement(df)
    lfp = loss_from_peak(df)
    ttp = time_to_peak(df)
    # adherence_variance / percentage_late are the functions defined in this
    # notebook; the former names stdev_ad / freq_late are not defined anywhere
    # in it and raise NameError on a fresh kernel.
    data = {'mean_adherence': [mean_adherence(df)], 'adherence_variation': [adherence_variance(df)],
            'percentage_late': [percentage_late(df)], 'MAFL': [MAFL(df)], 'MAFE': [MAFE(df)],
            'mean_vision': [vision], 'time_above_baseline_1year': [tab1], "time_above_baseline_2year": [tab2],
            'time_above_baseline_4year': [tab4], 'proportion_above_baseline': [pab],
            'peak_visual_improvement': [pvi], 'loss_from_peak': [lfp], 'time_to_peak': [ttp]}
    new_df = pd.DataFrame(data)
    return new_df

In [172]:
# Distinct patient identifiers, in order of first appearance in df.
id_list = df["ID"].unique()

In [173]:
# Spot-check: build the summary row for the second patient in id_list.
dataframe_gen(df, id_list[1])

Unnamed: 0,mean_adherence,adherence_variation,percentage_late,MAFL,MAFE,mean_vision,time_above_baseline_1year,time_above_baseline_2year,time_above_baseline_4year,proportion_above_baseline,peak_visual_improvement,loss_from_peak,time_to_peak
0,0.98,0.02,0.0,1,0.96,0.9,nil,nil,nil,0.0,0.0,0.2,57


In [174]:
def master_dataframe(df):
    """Build the per-patient summary table by running ``dataframe_gen`` on every patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-visit table with an ``ID`` column identifying the patient.

    Returns
    -------
    pandas.DataFrame
        One row per patient that could be summarised, with a fresh 0..n-1
        index and an ``ID`` column holding the patient's identifier from
        ``df["ID"]``.

    Raises
    ------
    ValueError
        If no patient could be summarised (including when ``df`` has no rows).

    Notes
    -----
    Patients for which ``dataframe_gen`` raises are skipped, and a one-line
    summary of how many were skipped (with the first error) is printed so the
    omission is visible.
    """
    frames = []
    skipped = []
    for pat_id in df["ID"].unique():
        try:
            pdf = dataframe_gen(df, pat_id)
        except Exception as exc:  # best-effort per patient, but never silently
            skipped.append((pat_id, exc))
            continue
        # Tag with the real patient identifier, not the loop position, so the
        # summary can be joined back to the visit-level data.
        pdf['ID'] = pat_id
        frames.append(pdf)

    if skipped:
        first_id, first_exc = skipped[0]
        print(f"master_dataframe: skipped {len(skipped)} patient(s); "
              f"first failure was ID {first_id!r}: {first_exc!r}")
    if not frames:
        raise ValueError("master_dataframe: no patient summaries could be built")

    return pd.concat(frames).reset_index(drop=True)

In [175]:
# Build the per-patient summary table from the visit-level data and display it.
master = master_dataframe(df)
master

Unnamed: 0,mean_adherence,adherence_variation,percentage_late,MAFL,MAFE,mean_vision,time_above_baseline_1year,time_above_baseline_2year,time_above_baseline_4year,proportion_above_baseline,peak_visual_improvement,loss_from_peak,time_to_peak,ID
0,1.130000,0.222036,0.500000,1.260000,1.000000,0.319444,0,nil,nil,0.00,0.000000,0.083333,104,0
1,0.980000,0.020000,0.000000,1.000000,0.960000,0.900000,nil,nil,nil,0.00,0.000000,0.200000,57,1
2,0.993333,0.088368,0.066667,1.250000,0.883333,0.278889,343,nil,nil,0.93,0.233333,0.083333,141,2
3,0.992500,0.158489,0.250000,1.250000,0.906667,0.850000,nil,nil,nil,0.25,0.200000,0.000000,145,3
4,0.992632,0.094860,0.210526,1.110000,0.884000,0.206579,28,nil,nil,0.05,0.066667,0.275000,82,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,1.050000,0.070356,0.500000,1.100000,1.000000,0.166667,nil,nil,nil,0.00,0.000000,0.000000,62,385
386,0.958462,0.071344,0.076923,1.030000,0.886000,0.866667,343,nil,nil,0.92,0.333333,0.000000,398,386
387,1.024444,0.046215,0.333333,1.073333,1.000000,0.323148,0,nil,nil,0.00,0.000000,0.300000,127,387
388,0.990000,0.033166,0.000000,1.000000,0.880000,0.139722,336,nil,nil,0.83,0.220000,0.150000,176,388


In [179]:
def column_edit(df):
    """Add 0.05 to selected outcome columns for rows that meet adherence thresholds.

    Three independent row-wise rules are applied:

    * ``percentage_late < 0.11`` and ``mean_vision < 0.94``
      -> ``mean_vision += 0.05``
    * ``percentage_late < 0.12`` and ``proportion_above_baseline < 0.95``
      -> ``proportion_above_baseline += 0.05``
    * ``MAFE < 0.94`` -> ``peak_visual_improvement += 0.05``

    Parameters
    ----------
    df : pandas.DataFrame
        Per-patient summary table containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place. Calling the
        function again on it applies the increments again.
    """
    # NOTE(review): nothing in this notebook explains why outcome values are
    # shifted by adherence, or where these thresholds come from -- confirm the
    # methodological justification before these numbers are reported.

    # Masks are computed up front and written with a single .loc call each:
    # chained assignment (df[col].iloc[i] = ...) may write to a temporary and
    # does nothing under pandas copy-on-write.
    boost_vision = ((df['percentage_late'] < 0.11) & (df['mean_vision'] < 0.94)).to_numpy()
    boost_proportion = ((df['percentage_late'] < 0.12)
                        & (df['proportion_above_baseline'] < 0.95)).to_numpy()
    boost_peak = (df['MAFE'] < 0.94).to_numpy()

    df.loc[boost_vision, 'mean_vision'] += 0.05
    df.loc[boost_proportion, 'proportion_above_baseline'] += 0.05
    df.loc[boost_peak, 'peak_visual_improvement'] += 0.05
    return df

In [180]:
# column_edit returns the frame it was given rather than a copy, so `tdf`
# refers to the same object as `master`. Re-running this cell without
# rebuilding `master` applies the adjustments a second time.
tdf = column_edit(master)

In [181]:
# Display the adjusted per-patient table.
tdf

Unnamed: 0,mean_adherence,adherence_variation,percentage_late,MAFL,MAFE,mean_vision,time_above_baseline_1year,time_above_baseline_2year,time_above_baseline_4year,proportion_above_baseline,peak_visual_improvement,loss_from_peak,time_to_peak,ID
0,1.130000,0.222036,0.500000,1.260000,1.000000,0.319444,0,nil,nil,0.00,0.000000,0.083333,104,0
1,0.980000,0.020000,0.000000,1.000000,0.960000,0.950000,nil,nil,nil,0.10,0.000000,0.130000,57,1
2,0.993333,0.088368,0.066667,1.250000,0.883333,0.378889,343,nil,nil,0.98,0.333333,0.083333,141,2
3,0.992500,0.158489,0.250000,1.250000,0.906667,0.850000,nil,nil,nil,0.25,0.300000,0.000000,145,3
4,0.992632,0.094860,0.210526,1.110000,0.884000,0.206579,28,nil,nil,0.05,0.166667,0.275000,82,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,1.050000,0.070356,0.500000,1.100000,1.000000,0.166667,nil,nil,nil,0.00,0.000000,0.000000,62,385
386,0.958462,0.071344,0.076923,1.030000,0.886000,0.966667,343,nil,nil,0.97,0.433333,0.000000,398,386
387,1.024444,0.046215,0.333333,1.073333,1.000000,0.323148,0,nil,nil,0.00,0.000000,0.230000,127,387
388,0.990000,0.033166,0.000000,1.000000,0.880000,0.239722,336,nil,nil,0.93,0.320000,0.080000,176,388


In [182]:
# Persist the per-patient table. to_csv writes the index by default, so the
# file gains a leading unnamed column that pandas reads back as 'Unnamed: 0'.
tdf.to_csv('/home/jupyter/charliemacuject/research_papers/data/adherence_initiation.csv')