In [3]:
# Turn off annoying ipykernel warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tableone import TableOne
from sklearn import linear_model
from patsy import dmatrices
import statsmodels.api as sm
import statsmodels.formula.api as smf

# New Onset Dosing in T1D

In [9]:
# Data import and cleaning
wd = '/Volumes/som/PEDS/RI Biostatistics Core/Shared/Shared Projects/Laura/BDC/Projects/Kimber Simmons/New Onset Dosing/Data_Cleaned/'
df = pd.read_csv(wd+'new_onset_doses.csv',na_values='.',skipinitialspace = True)
df.set_index(['MRN'],inplace=True)
# Remove T2D
df = df[df['Diabetes Type']=="T1D"]
# Format columns
# Convert all ages to months
df['Age'] = [int(a.split()[0])*12 if 'year' in a else int(a.split()[0]) for a in df['Age']]
# Categorize bicarb
bicarb_cat=[]
for b in df['bicarb']:
    if '<' in str(b):
        bicarb_cat.append('Severe')
    elif float(b) >= 5 and float(b) < 10:
        bicarb_cat.append('Moderate')
    elif float(b) >= 10:
        bicarb_cat.append('Mild')
    else:
        bicarb_cat.append(np.nan)
df['bicarb_cat'] = bicarb_cat
# Replace detection limit of bicarb with half the value
df['bicarb'] = [2.5 if '<' in str(b) and '5' in str(b) else 1.5 if '<' in str(b) and '3' in str(b) else float(b) for b in df['bicarb']]
# Convert to numeric
df['week1_carbbreakfast']=pd.to_numeric(df['week1_carbbreakfast'],errors='coerce')
df['week1_carblunch']=pd.to_numeric(df['week1_carblunch'],errors='coerce')
df['week1_carbdinner']=pd.to_numeric(df['week1_carbdinner'],errors='coerce')
# Combine levels of race
race = []
for r in df['Race']:
    if '\n' in str(r) or 'more than one' in str(r).lower():
        race.append('Multiracial')
    elif 'white' in str(r).lower():
        race.append('White')
    elif 'asian' in str(r).lower():
        race.append('Asian')
    elif 'black' in str(r).lower():
        race.append('Black/African American')
    elif 'native' in str(r).lower():
        race.append('Native American/Pacific Islander') 
    else: 
        race.append('Unknown/Not Reported')
for e in range(0,df.shape[0]):
    if df['Ethnicity'].reset_index(drop=True).iloc[e] == 'Hispanic or Latino [1]' and race[e] != 'Multiracial':
        race[e] = 'Hispanic'
df['Race'] = race
# Public vs. private insurance
# Public – Medicaid, TriCare, CHP, Indian Health Service, Denver Health (Medicaid)
# Private – Aetna, Anthem, Cigna, UHC, Kaiser, Cofinity, Christian Care
# Self-pay
public = ['medicaid','tricare','chp','indian','denver health']
private = ['aetna','anthem','cigna','uhc','kaiser','cofinity',
           'christian','commercial','friday','samaritan','rocky mountain','liberty','assurant']
insurance = []
for i in df['Insurance']:
    ins = str(i).lower()
    if 'Private' in ['Private' for p in private if p in ins]:
        insurance.append('Private')
    elif 'Public' in ['Public' for p in public if p in ins]: 
        insurance.append('Public')
    elif 'self' in ins:
        insurance.append('Self-pay')
    else: 
        insurance.append('Unknown/Not Reported')
df['Insurance'] = insurance

## Table 1: Participant Characteristics at Visit 1

In [15]:
# Make table 1
cols = ['Age','Sex','puberty_yn','Race','Insurance',
       'Initial_A1c','Hospitalization','DKA','pH','bicarb','bicarb_cat','Diabetes Type','bmi_onset','Wt_onset']
# Print
t1 = TableOne(df,columns=cols,display_all=True)
t1

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,336
"Age, mean (SD)",,0.0,117.4 (52.7)
"Sex, n (%)",Female,0.0,157 (46.7)
"Sex, n (%)",Male,,179 (53.3)
"puberty_yn, n (%)",no,0.0,182 (54.2)
"puberty_yn, n (%)",yes,,154 (45.8)
"Race, n (%)",Asian,0.0,4 (1.2)
"Race, n (%)",Black/African American,,12 (3.6)
"Race, n (%)",Hispanic,,45 (13.4)
"Race, n (%)",Multiracial,,17 (5.1)


# Methods
All analyses were performed using Python version 3.9.5. Model selection was based on 10-fold cross validation with elastic net as implemented in the scikit-learn package, version 0.24.2. Model selection was performed on variables: Age, Sex, Race/Ethnicity, Initial_A1c, DKA, and Wt_onset with Average_TDD as the outcome. Bicarb was categorized as a  into severe (< 5), moderate (5 - 10), and mild (> 10). Race and Ethnicity were combined per Kimber's 6/22 email.

The elasticnet model selected the Age, Initial_A1c, and Wt_onset variables. Next, a linear model was re-fit using the statsmodels library (version 0.12.2). A linear mixed model with random effect for provider produced convergence warnings and was not much better than a simple linear model. 

## Model Selection with ElasticNet

In [17]:
# Get model matrices
y,X = dmatrices('Average_TDD ~ Age*Sex+Age*Race+Age*Initial_A1c+Age*DKA+Age*Wt_onset',data = df)
# EN cross validation
regr = linear_model.ElasticNetCV(cv=10, random_state=1017)
cv = regr.fit(X, y.ravel())
# Print terms where coefficient is not 0
[X.design_info.column_names[i] for i in range(0,cv.coef_.shape[0]) if cv.coef_[i] != 0]

['Age:Sex[T.Male]',
 'Age:Race[T.Black/African American]',
 'Age:Race[T.Hispanic]',
 'Age:Race[T.Unknown/Not Reported]',
 'Age:DKA[T.Yes]',
 'Age:Initial_A1c',
 'Age:Wt_onset']

In [28]:
# Fit simple model
md = smf.ols("Average_TDD ~ Age+Initial_A1c+Wt_onset+Sex+DKA",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.308
Model:                            OLS   Adj. R-squared:                  0.295
Method:                 Least Squares   F-statistic:                     23.98
Date:                Wed, 23 Jun 2021   Prob (F-statistic):           5.96e-20
Time:                        17:35:30   Log-Likelihood:                -1120.5
No. Observations:                 276   AIC:                             2253.
Df Residuals:                     270   BIC:                             2275.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       4.6998      4.569      1.029      

The output above is interpreted the same way as simple multiple regression (i.e. without random effects), with the "Coef." column representing the beta values. So, for each one unit increase in age (in months), Average_TDD increases by 0.091 on average while holding the other variables constant. These coefficients can be multiplied and keep the same interpretation, so for each 1 year increase in age, Average_TDD increases by 0.093 * 12 = 1.116 on average.

Model selection using regularization and cross-validation may retain some variables even though they are not statistically significant. These variables may not have a significant p value, but they help increase the model's predictive value.

## BMI at onset

In [20]:
md = smf.ols("Average_TDD ~ bmi_onset",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.101
Method:                 Least Squares   F-statistic:                     31.85
Date:                Wed, 23 Jun 2021   Prob (F-statistic):           4.13e-08
Time:                        17:30:46   Log-Likelihood:                -1162.7
No. Observations:                 277   AIC:                             2329.
Df Residuals:                     275   BIC:                             2337.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.4627      4.325      2.881      0.0

## Insurance

In [21]:
md = smf.ols("Average_TDD ~ Insurance",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.008
Method:                 Least Squares   F-statistic:                    0.2149
Date:                Wed, 23 Jun 2021   Prob (F-statistic):              0.886
Time:                        17:30:48   Log-Likelihood:                -1253.1
No. Observations:                 295   AIC:                             2514.
Df Residuals:                     291   BIC:                             2529.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

In [19]:
df

Unnamed: 0_level_0,Patient_Name,Age,Sex,puberty_yn,Race,Ethnicity,Insurance,Date_diagnosis,Initial_A1c,Hospitalization,...,wt_1mo,BMI_1mo,TDD_correction1,TDD_correction2,TDD_carbbreakfast,TDD_carblunch,TDD_carbdinner,Average_TDD,TDD_kg,bicarb_cat
MRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2240959,"Jones, Lou Ellen",24,Female,no,Multiracial,Not Hispanic or Latino [2],Private,12/27/19,10.0,Yes,...,9.5,18.12,10.799784,8.0,12.500000,12.500000,12.500000,11.259957,1.185259,Severe
1948268,"Abdillahi, Afnaan",36,Female,no,Black/African American,Not Hispanic or Latino [2],Private,12/5/19,10.8,Yes,...,16.7,16.01,8.000000,10.0,10.000000,20.833333,8.333333,11.433333,0.684631,Moderate
1600154,"Johnson, Cole",72,Male,no,White,Not Hispanic or Latino [2],Private,2/5/19,11.9,Yes,...,,,18.000000,12.0,16.666667,16.666667,12.500000,15.166667,,Mild
1395523,"Janes, Noelle I",96,Female,no,White,Not Hispanic or Latino [2],Private,2/6/19,12.1,Yes,...,27.1,17.14,18.000000,12.0,16.666667,16.666667,25.000000,17.666667,0.651907,Mild
2153166,"Adams, Kaden",48,Male,no,White,Not Hispanic or Latino [2],Private,2/13/19,11.4,No,...,16.5,15.65,12.000000,12.0,10.000000,10.000000,16.666667,12.133333,0.735354,Mild
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581860,"Paul, Zion Aqique",84,Male,no,Unknown/Not Reported,Not Hispanic or Latino [2],Public,10/8/19,9.1,Yes,...,32.2,18.34,18.000000,,16.666667,16.666667,16.666667,17.000000,0.527950,Mild
1883313,"Reece, Stella",36,Female,no,White,Not Hispanic or Latino [2],Unknown/Not Reported,10/8/19,9.4,Yes,...,14.9,15.20,14.400000,,25.000000,25.000000,16.666667,20.266667,1.360179,Mild
2218098,"Barrera, Gemma",108,Female,no,White,Not Hispanic or Latino [2],Public,10/9/19,15.0,Yes,...,,,36.000000,,62.500000,62.500000,62.500000,55.875000,,Mild
1370356,"Johnson, Michael",156,Male,yes,Unknown/Not Reported,Unknown [3],Public,10/14/19,10.9,Yes,...,58.0,,36.000000,,33.333333,33.333333,33.333333,34.000000,0.586207,Mild
