In [1]:
import numpy as np
import pandas as pd
import math
from scipy import stats
from scipy.stats import chisquare, chi2_contingency, ttest_ind, contingency, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

import importlib
import sys

sys.path.append('/mnt/shared_folders/eResearch_glaucoma_project/emma_summer2023/honours')
import data_functions
importlib.reload(data_functions)



<module 'data_functions' from '/home/eake191/resmed202100066-Glaucoma_PRS/emma_summer2023/honours/data_functions.py'>

<br>

### Loading data

In [2]:
defs = pd.read_csv('incidence_definition/glaucoma_definitions.csv')

In [3]:
date_attending = pd.read_pickle('./data/derived/mixed_derived_and_extracted_merged.pkl') 
date_attending = date_attending[['f.eid', 'f.53.0.0']].rename(columns={'f.53.0.0': 'date_attending_assessment_center'})

In [4]:
defs = pd.merge(defs, date_attending, on='f.eid')

In [5]:
defs = defs[['f.eid', 'glaucoma_diagnosis_type', 'incident_glaucoma_diagnosis_source', 
      'earliest_age_glaucoma_diagnosed', 'earliest_incident_diagnosis_date', 'date_attending_assessment_center']]
defs.columns = ['f.eid', 'glaucoma_control', 'diagnosis_source', 'age_diagnosed', 'date_diagnosed', 'date_attending_assessment_center']

<br>

#### Time to diagnosis

In [6]:
defs['date_attending_assessment_center'] = pd.to_datetime(defs['date_attending_assessment_center'], format='%Y-%m-%d')
defs['date_diagnosed'] = pd.to_datetime(defs['date_diagnosed'], format='%Y-%m-%d')


defs['time_to_diagnosis'] = ((defs['date_diagnosed'] - defs['date_attending_assessment_center']).dt.days / 365.25)

In [7]:
defs['tte_3year'] = np.where(defs['time_to_diagnosis'] < 3, 'Glaucoma', 
                  np.where(defs['glaucoma_control'] == 'Control', 'Control', 'NaN'))
defs['tte_5year'] = np.where((defs['time_to_diagnosis'] < 5), 'Glaucoma', 
                  np.where(defs['glaucoma_control'] == 'Control', 'Control', 'NaN'))
defs['tte_10year'] = np.where((defs['time_to_diagnosis'] < 10), 'Glaucoma', 
                  np.where(defs['glaucoma_control'] == 'Control', 'Control', 'NaN'))

In [11]:
defs['tte_3year'].value_counts()
#Control     485772
#NaN          15325
#Glaucoma      1322
defs['tte_5year'].value_counts()
#Control     485772
#NaN          14271
#Glaucoma      2376
defs['tte_10year'].value_counts()
#Control     485772
#NaN          10829
#Glaucoma      5818

tte_10year
Control     485772
NaN          10829
Glaucoma      5818
Name: count, dtype: int64

In [11]:
defs['diagnosis_source'].value_counts()

diagnosis_source
Inpatient                  4430
GP                         1693
GP & Inpatient              825
Self-report                 675
Self-report & Inpatient     349
Self-report & GP             98
Name: count, dtype: int64

In [12]:
defs.loc[defs['diagnosis_source'] == 'Self-report']

Unnamed: 0,f.eid,glaucoma_control,diagnosis_source,age_diagnosed,date_diagnosed,date_attending_assessment_center,time_to_diagnosis,tte_3year,tte_5year,tte_10year
683,1006847,Incident,Self-report,67.3,NaT,2006-04-18,,,,
1418,1014190,Incident,Self-report,56.0,NaT,2008-11-28,,,,
1741,1017429,Incident,Self-report,65.0,NaT,2008-04-08,,,,
1791,1017923,Incident,Self-report,48.0,NaT,2008-07-11,,,,
2814,1028150,Incident,Self-report,73.0,NaT,2008-04-03,,,,
...,...,...,...,...,...,...,...,...,...,...
499294,5992958,Incident,Self-report,54.0,NaT,2009-12-19,,,,
499998,5999994,Incident,Self-report,,NaT,2009-03-10,,,,
500409,6004104,Incident,Self-report,66.9,NaT,2008-07-15,,,,
501004,6010053,Incident,Self-report,45.0,NaT,2010-03-01,,,,


In [18]:
data = {
    'ID': defs['f.eid'],
    'IOP_removed_3': defs['tte_3year'],
    'IOP_removed_5': defs['tte_5year'],
    'IOP_removed_10': defs['tte_10year'] }

df = pd.DataFrame(data)
df.to_csv('william.csv', index=False)

In [19]:
# save incidence data only
defs.to_csv('incidence_glaucoma.csv', index=False)

<br>

### 90-10 training/testing split

In [20]:
merged_df = pd.read_pickle('/mnt/shared_folders/eResearch_glaucoma_project/emma_summer2023/honours/data/derived/derived_cols_merged.pkl')
merged_df = pd.merge(defs, merged_df, on='f.eid')
merged_df

Unnamed: 0,f.eid,glaucoma_control,diagnosis_source,age_diagnosed,date_diagnosed,date_attending_assessment_center,time_to_diagnosis,tte_3year,tte_5year,tte_10year,...,Normal sleep duration,Insomnia frequency,Snoring,Daytime sleeping frequency,Exclusion,Glaucoma (prevalent D|TD),IOP subcohort,training_test_split_90_10,Exercise (summed MET minutes per week),training_test_split_80_20
0,1000011,Control,,,NaT,2008-04-22,,Control,Control,Control,...,1.0,2.0,0.0,0.0,0.0,Control,0,train,1668.0,train
1,1000026,Control,,,NaT,2009-03-18,,Control,Control,Control,...,1.0,1.0,1.0,0.0,0.0,Control,0,train,2826.0,train
2,1000032,Control,,,NaT,2008-04-10,,Control,Control,Control,...,0.0,1.0,0.0,0.0,0.0,Control,0,train,,test
3,1000044,Control,,,NaT,2008-09-13,,Control,Control,Control,...,0.0,2.0,1.0,1.0,0.0,Control,0,train,438.0,train
4,1000058,Control,,,NaT,2009-02-05,,Control,Control,Control,...,0.0,2.0,0.0,1.0,0.0,Control,0,train,,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502414,6024155,Control,,,NaT,2008-04-11,,Control,Control,Control,...,1.0,0.0,0.0,0.0,0.0,Control,0,test,,test
502415,6024163,Incident,GP,61.0,2016-04-25,2009-10-03,6.55989,,,Glaucoma,...,1.0,1.0,0.0,0.0,0.0,Glaucoma,1,train,1935.0,train
502416,6024172,Control,,,NaT,2010-02-10,,Control,Control,Control,...,0.0,2.0,0.0,0.0,0.0,Control,1,train,,train
502417,6024181,Control,,,NaT,2008-09-08,,Control,Control,Control,...,1.0,1.0,1.0,0.0,0.0,Control,0,train,682.0,train


In [22]:
def stratify_split(df, mask_dict, test_proportion, split_col_name):
    for name, mask in mask_dict.items():
        subgroup_df = df[mask]
        subgroup_test = subgroup_df.sample(frac=test_proportion, replace=False, random_state=2024)
        df.loc[subgroup_test.index.values, split_col_name] = 'test'
        df.loc[subgroup_df.index.difference(subgroup_test.index.values), split_col_name] = 'train'


def qc_split(df, mask_dict, split_col_name):
    test_proportion = (df[split_col_name] == 'test').sum() / len(df)
    print(f'Split col NA: {df[split_col_name].isna().sum()}')
    print(f'Test set proportion: {test_proportion}')
    print(f'Split col value counts: {df[split_col_name].value_counts()}')
    for name, mask in mask_dict.items():
        test_proportion = (df[mask][split_col_name] == 'test').sum() / len(df[mask])
        print(f'\n {name}')
        print(f'Test set proportion: {test_proportion}')
        print(df[mask][split_col_name].value_counts())


# IOP available subcohort = IOP data available & not excluded as a control
IOP_available_mask = merged_df['IOP_available'] == 1
not_excluded_mask = merged_df['Exclusion'] == 0

IOP_subcohort_mask = IOP_available_mask & not_excluded_mask

merged_df['IOP subcohort'] = 0
merged_df.loc[IOP_subcohort_mask, 'IOP subcohort'] = 1

'''
### 90/10 split ### 3 year tte #################################################
split_col = 'tte_3year_ttsplit'
merged_df[split_col] = np.nan
glaucoma_mask = merged_df['tte_3year'] == 'Glaucoma'
control_mask = merged_df['tte_3year'] == 'Control'

# IOP subcohort
mask_dict = {
    'Glaucoma': IOP_subcohort_mask & glaucoma_mask,
    'Control': IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )
qc_split(merged_df, mask_dict, split_col)

# Remaining study
mask_dict = {
    'Glaucoma': ~IOP_subcohort_mask & glaucoma_mask,
    'Control': ~IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )

qc_split(merged_df, mask_dict, split_col)

'''


### 90/10 split ### 5 year tte #################################################
split_col = 'tte_5year_ttsplit'
merged_df[split_col] = np.nan
glaucoma_mask = merged_df['tte_5year'] == 'Glaucoma'
control_mask = merged_df['tte_5year'] == 'Control'

# IOP subcohort
mask_dict = {
    'Glaucoma': IOP_subcohort_mask & glaucoma_mask,
    'Control': IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )
qc_split(merged_df, mask_dict, split_col)

# Remaining study
mask_dict = {
    'Glaucoma': ~IOP_subcohort_mask & glaucoma_mask,
    'Control': ~IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )

qc_split(merged_df, mask_dict, split_col)





### 90/10 split ### 10 year tte #################################################
split_col = 'tte_10year_ttsplit'
merged_df[split_col] = np.nan
glaucoma_mask = merged_df['tte_10year'] == 'Glaucoma'
control_mask = merged_df['tte_10year'] == 'Control'

# IOP subcohort
mask_dict = {
    'Glaucoma': IOP_subcohort_mask & glaucoma_mask,
    'Control': IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )
qc_split(merged_df, mask_dict, split_col)

# Remaining study
mask_dict = {
    'Glaucoma': ~IOP_subcohort_mask & glaucoma_mask,
    'Control': ~IOP_subcohort_mask & control_mask, }

stratify_split(
    df=merged_df,
    mask_dict=mask_dict,
    test_proportion=0.1,
    split_col_name=split_col )

qc_split(merged_df, mask_dict, split_col)

  df.loc[subgroup_test.index.values, split_col_name] = 'test'


Split col NA: 393514
Test set proportion: 0.021675135693514775
Split col value counts: tte_5year_ttsplit
train    98015
test     10890
Name: count, dtype: int64

 Glaucoma
Test set proportion: 0.09915611814345991
tte_5year_ttsplit
train    427
test      47
Name: count, dtype: int64

 Control
Test set proportion: 0.09999907775451669
tte_5year_ttsplit
train    97588
test     10843
Name: count, dtype: int64
Split col NA: 14271
Test set proportion: 0.09715794983868047
Split col value counts: tte_5year_ttsplit
train    439334
test      48814
Name: count, dtype: int64

 Glaucoma
Test set proportion: 0.09989484752891693
tte_5year_ttsplit
train    1712
test      190
Name: count, dtype: int64

 Control
Test set proportion: 0.09999973498771668
tte_5year_ttsplit
train    339607
test      37734
Name: count, dtype: int64


  df.loc[subgroup_test.index.values, split_col_name] = 'test'


Split col NA: 392775
Test set proportion: 0.021822423116960147
Split col value counts: tte_10year_ttsplit
train    98680
test     10964
Name: count, dtype: int64

 Glaucoma
Test set proportion: 0.09975267930750206
tte_10year_ttsplit
train    1092
test      121
Name: count, dtype: int64

 Control
Test set proportion: 0.09999907775451669
tte_10year_ttsplit
train    97588
test     10843
Name: count, dtype: int64
Split col NA: 10829
Test set proportion: 0.09784263732064273
Split col value counts: tte_10year_ttsplit
train    442432
test      49158
Name: count, dtype: int64

 Glaucoma
Test set proportion: 0.0998914223669924
tte_10year_ttsplit
train    4145
test      460
Name: count, dtype: int64

 Control
Test set proportion: 0.09999973498771668
tte_10year_ttsplit
train    339607
test      37734
Name: count, dtype: int64


In [11]:
merged_df['tte_3year'].value_counts()

NameError: name 'merged_df' is not defined

In [24]:
merged_df.to_csv('incidence_merged_df.csv', index=False)