In [None]:
#| echo: false
# Packages and working directories
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from pygam import LinearGAM, s, f
wd = '/Volumes/PEDS/RI Biostatistics Core/Shared/Shared Projects/Laura/BDC/Projects/Viral Shah/GMI and A1c/'

# Data Cleaning

In [None]:
# Build the pooled analysis dataset: one row per HbA1c / mean-glucose pair,
# with columns ID, HbA1c, Mean Glucose and Study.
#
# Columns are always renamed through an explicit {old: new} mapping rather than
# by positional assignment to `.columns`: `pd.read_csv(usecols=[...])` returns
# columns in *file* order, not in the order of the `usecols` list, so positional
# renaming could silently swap HbA1c and mean glucose.
ANALYSIS_COLS = ['ID','HbA1c','Mean Glucose']


def read_renamed(path, col_map, **read_kwargs):
    """Read only the columns named in `col_map` and rename them.

    Parameters
    ----------
    path : str
        CSV file to read.
    col_map : dict
        Mapping of original column name -> analysis column name.
    **read_kwargs
        Passed straight through to `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are the values of `col_map`, in that order.
    """
    raw = pd.read_csv(path, usecols=list(col_map), **read_kwargs)
    return raw.rename(columns=col_map)[list(col_map.values())]


def load_gmi_extract(relative_path, study):
    """Load one of the gmiData extracts (DeidentID / a1cLab / gluMean layout).

    `relative_path` is appended to `wd`; `study` is stored in the Study column.
    """
    extract = read_renamed(wd + relative_path,
                           {'DeidentID':'ID','a1cLab':'HbA1c','gluMean':'Mean Glucose'})
    return extract.assign(Study=study)


# Import Kaan's data (use clean_cgms.py to generate mean glucose file)
# Had to manually edit some names in glucose file to match demographics
kaan = read_renamed(wd + 'Data_Clean/kaan_mean_glucose.csv',
                    {'ID':'ID','HbA1c':'HbA1c','14 Overall Mean':'Mean Glucose'})
kaan = kaan.assign(Study='KAAN')
# CGMND
cgmnd_a1c = pd.read_csv(wd+'Data_Raw/CGMND-af920dee-2d6e-4436-bc89-7a7b51239837/NonDiabSampleResults.csv')
cgmnd_a1c = (cgmnd_a1c.loc[cgmnd_a1c['Analyte'] == "HBA1C", ['PtID','Value']]
             .rename(columns={'PtID':'ID','Value':'HbA1c'}))
# Calculate mean glucose for each person in CGMND (rows with RecordType "CGM" only).
# Participants without any "CGM" rows are simply absent here; previously they
# carried a NaN mean and were removed by the final dropna, so the result is the same.
cgmnd_cgm = pd.read_csv(wd+'Data_Raw/CGMND-af920dee-2d6e-4436-bc89-7a7b51239837/NonDiabDeviceCGM.csv')
cgmnd_cgm = (cgmnd_cgm[cgmnd_cgm['RecordType'] == "CGM"]
             .groupby('PtID', sort=False)['Value']
             .mean()
             .rename('Mean Glucose')
             .rename_axis('ID')
             .reset_index())
# Merge
cgmnd = pd.merge(cgmnd_a1c,cgmnd_cgm,on='ID').assign(Study='CGMND')
# FLAIR
flair = load_gmi_extract('Data_Raw/gmiDataFlair 042222 .csv', 'FLAIR')
# MOBILE
mobile = load_gmi_extract('Data_Raw/gmiDataMobile 031722 .csv', 'MOBILE')
# DIAMOND, etc
diamond = load_gmi_extract('Data_Raw/gmiData 020222 .csv', 'DIAMOND')
# CITY
# Visit labels are lower-cased on both sides so the merge keys compare equal.
# `.assign` builds a new frame instead of writing into a filtered slice.
city = pd.read_csv(wd+'Data_Raw/CITYPublicDataset-344bea7d-8085-4deb-8038-6cb747a744e3/Data Tables/gluIndices Ext.txt',sep='|')
city = city[city['time'] == '1) Overall'].assign(Visit=lambda d: d['Visit'].str.lower())
city_a1c = pd.read_csv(wd+'Data_Raw/CITYPublicDataset-344bea7d-8085-4deb-8038-6cb747a744e3/Data Tables/vwCITY_STASampleResults.txt',sep='|',encoding='utf-16')
city_a1c = city_a1c[city_a1c['ResultName'] == 'GLYHB'].assign(Visit=lambda d: d['Visit'].str.lower())
city = (pd.merge(city_a1c,city,on=['PtID','Visit'])[['PtID','Value','gluMean']]
        .rename(columns={'PtID':'ID','Value':'HbA1c','gluMean':'Mean Glucose'})
        .assign(Study='CITY'))
# DCLP3
# Per Peter, okay to use the 26 week HbA1c for post-randomization
# NOTE(review): the filter below actually selects the '13 Week' visit (paired with
# the '3. first 3mo' CGM window), which does not match the comment above or the
# report text. Confirm which visit is intended before relying on DCLP3 rows.
dclp3_cgm = pd.read_csv(wd+'Data_Raw/DCLP3 Public Dataset - Release 2 - 2022-01-18-9fc308ee-5d12-4651-8aec-262576777a31/Data Files/gluIndices.txt',sep='|')
dclp3_cgm = dclp3_cgm.loc[(dclp3_cgm['analysis'] == '3. first 3mo') &
                          (dclp3_cgm['period'] == '2. Post Randomization'),
                          ['PtID','gluMean']]
dclp3_a1c = pd.read_csv(wd+'Data_Raw/DCLP3 Public Dataset - Release 2 - 2022-01-18-9fc308ee-5d12-4651-8aec-262576777a31/Data Files/SampleResults_a.txt',sep='|')
dclp3_a1c = dclp3_a1c.loc[(dclp3_a1c['ResultName'] == 'GLYHB') &
                          (dclp3_a1c['Visit'] == '13 Week'),
                          ['PtID','Value']]
# Merge and clean up
dclp3 = (pd.merge(dclp3_cgm,dclp3_a1c,on='PtID')[['PtID','Value','gluMean']]
         .rename(columns={'PtID':'ID','Value':'HbA1c','gluMean':'Mean Glucose'})
         .assign(Study='DCLP3'))
# WISDM
wisdm = pd.read_csv(wd+'/Data_Raw/WISDMPublicDataset-18f24ae5-b4fb-4e93-bec6-7021086419fa/Data Tables/gluIndices Ext.txt',sep='|')
wisdm = wisdm[wisdm['time'] == '1) Overall']
wisdm_a1c = pd.read_csv(wd+'Data_Raw/WISDMPublicDataset-18f24ae5-b4fb-4e93-bec6-7021086419fa/Data Tables/STASampleResults.txt',sep='|',encoding='utf-16')
wisdm_a1c = wisdm_a1c[wisdm_a1c['ResultName'] == 'GLYHB']
wisdm = (pd.merge(wisdm_a1c,wisdm,left_on=['PtID','Visit'],right_on=['PtID','visit'])[['PtID','Value','gluMean']]
         .rename(columns={'PtID':'ID','Value':'HbA1c','gluMean':'Mean Glucose'})
         .assign(Study='WISDM'))
# Combine everything (fresh 0..n-1 index so row labels are unique across studies)
gmi_data = pd.concat([kaan,cgmnd,flair,mobile,diamond,city,dclp3,wisdm],ignore_index=True)
# As numeric
gmi_data = gmi_data.astype({'HbA1c': float, 'Mean Glucose': float})
# Drop missing
gmi_data = gmi_data.dropna(subset=['Mean Glucose','HbA1c']).reset_index(drop=True)
# Write for checking results in R
gmi_data.to_csv(wd+'/Data_Clean/analysis_dataset.csv',index=False)

- HbA1c and mean glucose data were pulled from Kaan's data, CGMND, FLAIR, MOBILE, CITY, DCLP3, WISDM, and original data from Peter including participants from DIAMOND, REPLACE, and HypoDE.
- DCLP3 includes CGM data for two time periods ("Baseline" and "Post-Randomization") but HbA1c for "Randomization", "13 Week", and "26 Week" visits. Per Peter, 26 week HbA1c was matched with post-randomization mean glucose.
- CITY contains multiple files with HbA1c values. The file "vwCITY_STASampleResults.txt" was used for this report.

In [None]:
#| label: fig-scatter
#| fig-cap: "HbA1c and Mean Sensor Glucose"
pio.renderers.default = "notebook"

# Observed HbA1c against mean sensor glucose, one marker per row of gmi_data.
fig = go.Figure(data=go.Scatter(x=gmi_data['Mean Glucose'], y=gmi_data['HbA1c'],mode='markers',showlegend = False,name = "HbA1c Data"))
# Overlay the straight line 3.31 + 0.02392 * mean glucose, labelled 'GMI'.
# NOTE(review): presumably the published GMI equation (mg/dL) - confirm the source.
fig.add_trace(go.Scatter(x=gmi_data['Mean Glucose'], y=3.31 + 0.02392*gmi_data['Mean Glucose'], name='GMI',
                         line=dict(color='red')))
# Axis titles were previously defined only on an unused plotly-express figure,
# so the displayed figure had unlabeled axes; set them on the figure that is shown.
fig.update_layout(xaxis_title='Mean Glucose (mg/dL)', yaxis_title='HbA1c (%)')

fig.show();

# GAM
- A linear generalized additive model (GAM) was fit under the assumption that HbA1c is normally distributed (see @fig-a1chist).
- Reference on testing for a nonlinear association: https://stats.stackexchange.com/questions/35893/how-do-i-test-a-nonlinear-association

In [None]:
#| label: fig-a1chist
#| fig-cap: "HbA1c Distribution"
# Histogram of the pooled HbA1c values; label the axes so the figure stands alone.
ax = gmi_data['HbA1c'].hist()
ax.set(xlabel='HbA1c (%)', ylabel='Number of observations');

In [None]:
#| output: false
# Response (HbA1c) and predictor (mean glucose) as plain NumPy vectors
y = gmi_data['HbA1c'].to_numpy(copy=True)
X = gmi_data['Mean Glucose'].to_numpy(copy=True)
# Linear GAM with a single spline term on feature 0 (mean glucose), no penalty
spline_term = s(0,penalties='none')
gam = LinearGAM(spline_term)
# gridsearch is given a 2-D array: the 1-D predictor is reshaped into a single
# column of shape (n, 1). keep_best=True leaves the best model in `gam`.
gam.gridsearch(X.reshape(-1, 1),y,keep_best=True);
gam.summary()

In [None]:
# Data for plotting: an evenly spaced 500-point grid over the spline term's range
XX = gam.generate_X_grid(term=0, n=500)
# Predict on the grid itself so x and y of the fitted curve line up point for point.
# (Predicting on the raw data X gave a y vector of different length and order
# than the grid used for the x axis.)
yy = gam.predict(XX)
# Plot observed data with the fitted GAM curve on top
fig = go.Figure(data=go.Scatter(x=gmi_data['Mean Glucose'], y=gmi_data['HbA1c'],mode='markers',showlegend = False,name = "HbA1c Data"))
# NOTE(review): the fitted curve is labelled 'FDA' in the legend - confirm this is the intended name.
fig.add_trace(go.Scatter(x=XX[:,0], y=yy, name='FDA',line=dict(color='red')))
fig.show();

In [None]:
import statsmodels.api as sm
from statsmodels.gam.api import GLMGam, BSplines

# Penalty weight for the spline smoother; value taken from gam_bs.select_penweight
PENALTY_WEIGHT = 4407.50829403

# Cubic B-spline basis (20 degrees of freedom) on mean glucose
bs = BSplines(gmi_data['Mean Glucose'], df=20, degree=3)
# Intercept-only formula: the smoother supplies the mean-glucose effect
gam_bs = GLMGam.from_formula(
    'HbA1c ~ 1',
    smoother=bs,
    data=gmi_data,
    alpha=PENALTY_WEIGHT,
)
res_bs = gam_bs.fit()
res_bs.summary()

In [None]:
# Run the penalty-weight search on the fitted model spec and show only the first
# element of what it returns (the value hard-coded as alpha when building gam_bs).
penweight_search = gam_bs.select_penweight()
penweight_search[0]

In [None]:
from statsmodels.gam.tests.test_penalized import df_autos

In [None]:
# Writes the df_autos frame (imported from the statsmodels GAM test module) to
# temp.csv in the user's home directory.
# NOTE(review): looks like scratch work unrelated to the GMI analysis - confirm and remove.
df_autos.to_csv('~/temp.csv')

In [None]:
# Prints the built-in help text for GLMGam.
# NOTE(review): looks like leftover interactive exploration - confirm and remove from the final report.
help(GLMGam)