In [1]:
# Turn off annoying ipykernel warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tableone import TableOne
from sklearn import linear_model
from patsy import dmatrices
import statsmodels.api as sm
import statsmodels.formula.api as smf

  from collections import Mapping


# New Onset Dosing in T1D

In [3]:
# Data import and cleaning
wd = '/Volumes/som/PEDS/RI Biostatistics Core/Shared/Shared Projects/Laura/BDC/Projects/Kimber Simmons/New Onset Dosing/Data_Cleaned/'
df = pd.read_csv(wd+'new_onset_doses.csv',na_values='.',skipinitialspace = True)
df.set_index(['MRN'],inplace=True)
# Remove T2D
df = df[df['Diabetes Type']=="T1D"]
# Format columns
# Convert all ages to months
df['Age'] = [int(a.split()[0])*12 if 'year' in a else int(a.split()[0]) for a in df['Age']]
# Categorize bicarb
bicarb_cat=[]
for b in df['bicarb']:
    if '<' in str(b):
        bicarb_cat.append('Severe')
    elif float(b) >= 5 and float(b) < 10:
        bicarb_cat.append('Moderate')
    elif float(b) >= 10:
        bicarb_cat.append('Mild')
    else:
        bicarb_cat.append(np.nan)
df['bicarb_cat'] = bicarb_cat
# Replace detection limit of bicarb with half the value
df['bicarb'] = [2.5 if '<' in str(b) and '5' in str(b) else 1.5 if '<' in str(b) and '3' in str(b) else float(b) for b in df['bicarb']]
# Convert to numeric
df['week1_carbbreakfast']=pd.to_numeric(df['week1_carbbreakfast'],errors='coerce')
df['week1_carblunch']=pd.to_numeric(df['week1_carblunch'],errors='coerce')
df['week1_carbdinner']=pd.to_numeric(df['week1_carbdinner'],errors='coerce')
# Combine levels of race
race = []
for r in df['Race']:
    if '\n' in str(r) or 'more than one' in str(r).lower():
        race.append('Multiracial')
    elif 'white' in str(r).lower():
        race.append('White')
    elif 'asian' in str(r).lower():
        race.append('Asian')
    elif 'black' in str(r).lower():
        race.append('Black/African American')
    elif 'native' in str(r).lower():
        race.append('Native American/Pacific Islander') 
    else: 
        race.append('Unknown/Not Reported')
for e in range(0,df.shape[0]):
    if df['Ethnicity'].reset_index(drop=True).iloc[e] == 'Hispanic or Latino [1]' and race[e] != 'Multiracial':
        race[e] = 'Hispanic'
df['Race'] = race
# Public vs. private insurance
# Public – Medicaid, TriCare, CHP, Indian Health Service, Denver Health (Medicaid)
# Private – Aetna, Anthem, Cigna, UHC, Kaiser, Cofinity, Christian Care
# Self-pay
public = ['medicaid','tricare','chp','indian','denver health']
private = ['aetna','anthem','cigna','uhc','kaiser','cofinity',
           'christian','commercial','friday','samaritan','rocky mountain','liberty','assurant']
insurance = []
for i in df['Insurance']:
    ins = str(i).lower()
    if 'Private' in ['Private' for p in private if p in ins]:
        insurance.append('Private')
    elif 'Public' in ['Public' for p in public if p in ins]: 
        insurance.append('Public')
    elif 'self' in ins:
        insurance.append('Self-pay')
    else: 
        insurance.append('Unknown/Not Reported')
df['Insurance'] = insurance
# Write to CSV
df.to_csv(wd+"cleaned_data.csv")

## Table 1: Participant Characteristics at Visit 1

In [4]:
# Make table 1
cols = ['Age','Sex','puberty_yn','Race','Insurance',
       'Initial_A1c','Hospitalization','DKA','pH','bicarb','bicarb_cat','Diabetes Type','bmi_onset','Wt_onset']
# Print
t1 = TableOne(df,columns=cols,display_all=True)
t1

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,336
"Age, mean (SD)",,0.0,117.4 (52.7)
"Sex, n (%)",Female,0.0,157 (46.7)
"Sex, n (%)",Male,,179 (53.3)
"puberty_yn, n (%)",no,0.0,182 (54.2)
"puberty_yn, n (%)",yes,,154 (45.8)
"Race, n (%)",Asian,0.0,4 (1.2)
"Race, n (%)",Black/African American,,12 (3.6)
"Race, n (%)",Hispanic,,45 (13.4)
"Race, n (%)",Multiracial,,17 (5.1)


# Methods
All analyses were performed using Python version 3.9.5 and R version 4.1.0. Model selection was based on 10-fold cross validation with hierarchical group-lasso regularization as implemented in the glinternet R package, version 1.0.11. Model selection was performed on variables: Age, Sex, Race/Ethnicity, Initial_A1c, DKA, and Wt_onset and all interactions with Average_TDD as the outcome. Bicarb was categorized as a  into severe (< 5), moderate (5 - 10), and mild (> 10). Race and Ethnicity were combined per Kimber's 6/22 email.

Next, a linear model was re-fit using the statsmodels library (version 0.12.2). A linear mixed model with random effect for provider produced convergence warnings and was not much better than a simple linear model. 

## Model Selection with ElasticNet

In [12]:
# Get model matrices
y,X = dmatrices('Average_TDD ~ Age+Sex+Race+Initial_A1c+DKA+Wt_onset',data = df)
# EN cross validation
regr = linear_model.ElasticNetCV(cv=10, random_state=1017)
cv = regr.fit(X, y.ravel())
# Print terms where coefficient is not 0
[X.design_info.column_names[i] for i in range(0,cv.coef_.shape[0]) if cv.coef_[i] != 0]
# For testing interaction terms with the Lasso, use the glinternet R package

['Sex[T.Male]',
 'Race[T.Hispanic]',
 'Race[T.Unknown/Not Reported]',
 'DKA[T.Yes]',
 'Age',
 'Initial_A1c',
 'Wt_onset']

In [6]:
# Fit simple model
md = smf.ols("Average_TDD ~ Age+Initial_A1c*Wt_onset+Sex*DKA",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.366
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                     22.09
Date:                Thu, 22 Jul 2021   Prob (F-statistic):           1.66e-23
Time:                        09:13:14   Log-Likelihood:                -1108.3
No. Observations:                 276   AIC:                             2233.
Df Residuals:                     268   BIC:                             2262.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 41

The output above is interpreted the same way as simple multiple regression (i.e. without random effects), with the "Coef." column representing the beta values. So, for each one unit increase in age (in months), Average_TDD increases by 0.091 on average while holding the other variables constant. These coefficients can be multiplied and keep the same interpretation, so for each 1 year increase in age, Average_TDD increases by 0.093 * 12 = 1.116 on average.

Model selection using regularization and cross-validation may retain some variables even though they are not statistically significant. These variables may not have a significant p value, but they help increase the model's predictive value.

## BMI at onset

In [7]:
md = smf.ols("Average_TDD ~ bmi_onset",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.101
Method:                 Least Squares   F-statistic:                     31.85
Date:                Thu, 22 Jul 2021   Prob (F-statistic):           4.13e-08
Time:                        09:13:14   Log-Likelihood:                -1162.7
No. Observations:                 277   AIC:                             2329.
Df Residuals:                     275   BIC:                             2337.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.4627      4.325      2.881      0.0

## Insurance

In [8]:
md = smf.ols("Average_TDD ~ Insurance",data=df,missing='drop')
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:            Average_TDD   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.008
Method:                 Least Squares   F-statistic:                    0.2149
Date:                Thu, 22 Jul 2021   Prob (F-statistic):              0.886
Time:                        09:13:14   Log-Likelihood:                -1253.1
No. Observations:                 295   AIC:                             2514.
Df Residuals:                     291   BIC:                             2529.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

# Correction Equations

## Methods

In [9]:
df['week1_correction1']

MRN
2240959           166.67
1948268              225
1600154              100
1395523              100
2153166              150
2153011               75
1933564              100
1894619              100
2166168              100
1954598              150
2180550              100
2124595              150
1693775              100
2178432              100
2213482              150
2234432              150
2147871           166.67
2163021              100
2214815              150
1925230               75
2220683               40
2074572               40
1811611              150
1078574               50
1393230               50
1242100              150
2164032              150
2141859               60
2148868              100
2152779               50
1331445              100
2153853              100
2158965              100
2164270              100
2174680              100
2178341               50
1508019              100
1906508              100
1365112               50
1987934              

In [10]:
help(np.random.choice)

Help on built-in function choice:

choice(...) method of numpy.random.mtrand.RandomState instance
    choice(a, size=None, replace=True, p=None)
    
    Generates a random sample from a given 1-D array
    
    .. versionadded:: 1.7.0
    
    .. note::
        New code should use the ``choice`` method of a ``default_rng()``
        instance instead; please see the :ref:`random-quick-start`.
    
    Parameters
    ----------
    a : 1-D array-like or int
        If an ndarray, a random sample is generated from its elements.
        If an int, the random sample is generated as if it were ``np.arange(a)``
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    replace : boolean, optional
        Whether the sample is with or without replacement. Default is True,
        meaning that a value of ``a`` can be selected mu