This notebook goes over the code needed to reproduce the results from Table 3, using WiSER with the Women's Health Study (WHS) accelerometry data. We also compare its use to fitting a linear mixed effects model via MixedModels.jl.

## Availability & Description

Due to confidentiality concerns, access to the WHS Accelerometry dataset is only available through the National Institutes of Health (NIH) database of Genotypes and Phenotypes (dbGaP). Researchers can apply for access to download this dataset through dbGaP. 

The URL for the webpage is https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001964.v1.p1 and the dbGaP Study Accession identifier is phs001964.v1.p1. This page includes a description of the dataset, study, and details on how to request access to the data. We cannot give more details on the data due to dbGaP's data use agreement. 


This notebook goes over code that, when used with dbGaP's WHS Accelerometry data, reproduces the results in the paper (Table 3).

In [None]:
# Print the Julia version and platform details of the current session.
versioninfo()

Julia Version 1.5.0
Commit 96786e22cc (2020-08-01 23:44 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i9-9920X CPU @ 3.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, skylake)


## Data Cleaning

In [None]:
using DataFrames, CSV, StatsBase, Statistics, CodecZlib, Dates
# Extend the number of columns printed when displaying a DataFrame.
ENV["COLUMNS"]=1000

Import the data and add some variables.

In [None]:
# Directory holding the gzipped dbGaP phenotype tables (path is relative to the notebook).
pathofdata = "WHS_Accelerometer_phs001964/PhenoGenotypeFiles/RootStudyConsentSet_phs001964.WHS_Accelerometer.v1.p1.c1.GRU/PhenotypeFiles/"

# Read one gzipped, tab-delimited table located under `pathofdata` into a DataFrame.
# Lines starting with `#` and empty lines are skipped; the file is closed on return.
function readphenotable(filename)
    return open(pathofdata * filename) do io
        DataFrame!(CSV.File(GzipDecompressorStream(io), delim="\t", comment = "#", ignoreemptylines=true))
    end
end

# Subject-level table
accelerometer_subject = readphenotable("phs001964.v1.pht009959.v1.p1.WHS_Accelerometer_Subject.MULTI.txt.gz")

# Minute-level (60 second epoch) accelerometer records
accelerometer_mins = readphenotable("phs001964.v1.pht009963.v1.p1.c1.WHS_Accelerometer_60sec.GRU.txt.gz")

# Daily summary table
sum_data = readphenotable("phs001964.v1.pht009960.v1.p1.c1.WHS_Accelerometer_d20180514_Pub.GRU.txt.gz")

# Smoking table
smoking_pub = readphenotable("phs001964.v1.pht009961.v1.p1.c1.WHS_Accelerometer_Smoking_Pub.GRU.txt.gz")

# Clock-time features of every minute-level record.
minute_times = accelerometer_mins[!, :timeHMS]
accelerometer_mins[!, :hour] = Dates.hour.(minute_times);    # hour of day
accelerometer_mins[!, :mins] = Dates.minute.(minute_times);  # minute within the hour
# 5-minute bin within the hour (0, 1, 2, ... stored as floating point)
accelerometer_mins[!, :mingroup] = floor.(accelerometer_mins[!, :mins] / 5)

# Season code -> label; missing stays missing, any code other than 1-3 becomes "autumn".
function seasonlabel(code)
    ismissing(code) && return missing
    code == 1 && return "winter"
    code == 2 && return "spring"
    code == 3 && return "summer"
    return "autumn"
end

# Smoking code -> label; missing stays missing, any code other than 1-2 becomes "current".
function smokerlabel(code)
    ismissing(code) && return missing
    code == 1 && return "never"
    code == 2 && return "past"
    return "current"
end

# Race code -> label; missing stays missing, any code other than 1-5 becomes "other".
function racelabel(code)
    ismissing(code) && return missing
    code == 1 && return "white"
    code == 2 && return "hispanic"
    code == 3 && return "african american"
    code == 4 && return "asian"
    code == 5 && return "native american"
    return "other"
end

# NOTE(review): this overwrites the season codes with text labels in place, while later
# cells join on :season against the daily summary table and recode season again --
# confirm both sides of that join hold the same representation.
accelerometer_mins[!, :season] = map(seasonlabel, accelerometer_mins[!, :season])
# create smoking variable
smoking_pub[!, :smoker] = map(smokerlabel, smoking_pub[!, :smoke])
# create race variable from numeric definitions
sum_data[!, :RACE] = map(racelabel, sum_data[!, :RACE]);

# Columns of the daily summary table that are carried forward.
keepvars = [
    :dbGaP_Subject_ID, :Subject_ID, :wday, :ep60_maxsteps, :m_total,
    :sum_valid, :EP60_total_steps, :compliant, :RACE, :stairs,
    :genhealth, :bmi, :ageaccel, :day_worn, :season
]

# Restrict the daily summary data to those columns (same column order as listed).
sum_data = sum_data[!, keepvars]

In [None]:
# Hourly totals of steps and vector-magnitude counts for each subject and day.
hourly_groups = DataFrames.groupby(accelerometer_mins, [:Subject_ID, :wday, :hour])
summarized_hour = combine(hourly_groups, :steps => sum, :count_vm => sum)

In [None]:
"""
    findtop_p(x, p)

Return the zero-based position in `x` of its `p`-th largest value.

Ties are broken by position: equal values are ranked in the order they appear, so
different `p` always yield different positions. (Looking the `p`-th largest *value* up
with `findfirst` would return the same position for `p = 1` and `p = 2` whenever the two
largest values are equal.)

The caller uses the result as an hour of the day; that assumes `x` holds one entry per
hour, in order, starting at hour 0 -- TODO confirm for days with unrecorded hours.

Throws a `BoundsError` when `p` is outside `1:length(x)`.
"""
function findtop_p(x, p)
    # Stable descending order: indices of equal values stay in ascending order.
    ranking = sortperm(x; rev=true)
    # Positions are 1-based, hours are 0-based.
    return ranking[p] - 1
end

# For each subject and day, find the hours with the largest and the second-largest
# summed vector magnitude.
daily_groups = DataFrames.groupby(summarized_hour, [:Subject_ID, :wday])
tophour(v) = findtop_p(v, 1)
secondhour(v) = findtop_p(v, 2)
maxhours = combine(daily_groups,
        :count_vm_sum => tophour => :maxvm_hour1,
        :count_vm_sum => secondhour => :maxvm_hour2)

In [None]:
# Minute-level rows that fall inside each subject-day's top hour.
# Columns 1:3 of `maxhours` are the two grouping keys followed by :maxvm_hour1.
top1_keys = maxhours[!, [:Subject_ID, :wday, :maxvm_hour1]]
tophour_data = DataFrames.innerjoin(
    accelerometer_mins,
    top1_keys;
    on = [:Subject_ID => :Subject_ID, :wday => :wday, :hour => :maxvm_hour1],
    makeunique = false,
    validate = (false, false))

In [None]:
# Minute-level rows that fall inside each subject-day's second-highest hour.
# Columns 1, 2 and 4 of `maxhours` are the two grouping keys and :maxvm_hour2.
top2_keys = maxhours[!, [:Subject_ID, :wday, :maxvm_hour2]]
top2hour_data = DataFrames.innerjoin(
    accelerometer_mins,
    top2_keys;
    on = [:Subject_ID => :Subject_ID, :wday => :wday, :hour => :maxvm_hour2],
    makeunique = false,
    validate = (false, false))

In [None]:
# Stack the top-hour rows on top of the second-hour rows. This rebinds `top2hour_data`,
# which from here on holds the minute-level data of both hours.
top2hour_data = vcat(tophour_data, top2hour_data)

In [None]:
# Collapse the minute-level rows into 5-minute bins. The summed steps of a bin are the
# outcome variable; day_worn and season are taken from the bin's first row.
fivemin_groups = DataFrames.groupby(top2hour_data, [:Subject_ID, :wday, :hour, :mingroup])
summarized_5min = combine(fivemin_groups,
    :day_worn => first => :day_worn,
    :season => first => :season,
    :steps => sum => :steps)

In [None]:
# Attach the daily summary columns to every 5-minute record (left join: all 5-minute
# records are kept).
top2hour_sumdatacomb = DataFrames.leftjoin(
    summarized_5min,
    sum_data;
    on = [:Subject_ID, :wday, :day_worn, :season],
    makeunique = false,
    indicator = nothing,
    validate = (false, false))

In [None]:
# Attach the smoking columns, matching on both subject identifiers (left join).
top2hourdata = DataFrames.leftjoin(
    top2hour_sumdatacomb,
    smoking_pub;
    on = [:Subject_ID, :dbGaP_Subject_ID],
    makeunique = false,
    indicator = nothing,
    validate = (false, false))

In [None]:
# Final recoding and renaming of the merged dataset.
# Don't run twice: the columns read here are renamed at the end of the cell.

top2hourdata[!, :steps] = Float64.(top2hourdata[!, :steps]);

# Smoking status labels from the numeric `smoke` code, with "Never" as the first level.
top2hourdata[!, :smoker] = map(x -> ismissing(x) ? missing : x == 1 ? "Never" : x == 2 ? "Past" : "Current",
    top2hourdata[!, :smoke])
top2hourdata[!, :smoker] = levels!(CategoricalArray(top2hourdata[!, :smoker]),
    ["Never"; "Past"; "Current"]);

# Capitalise the race labels; `x` is a scalar, so the plain `ismissing` test is used.
top2hourdata[!, :RACE] = map(x -> ismissing(x) ? missing : titlecase(String(x)),
    top2hourdata[!, :RACE])
top2hourdata[!, :RACE] = levels!(CategoricalArray(top2hourdata[!, :RACE]),
    ["White"; "African American"; "Asian"; "Hispanic"; "Native American"; "Other"])

top2hourdata[!, :wday] = levels!(CategoricalArray(top2hourdata[!, :wday]),
    ["Sun"; "Mon"; "Tues"; "Wed"; "Thurs"; "Fri"; "Sat"])

# Season labels. Numeric codes are recoded (1 winter, 2 spring, 3 summer, otherwise
# autumn); values that are already text labels are only capitalised. Without the text
# branch every label would fail the numeric comparisons and end up as "Autumn".
top2hourdata[!, :season] = map(top2hourdata[!, :season]) do x
    ismissing(x) && return missing
    x isa AbstractString && return titlecase(x)
    return x == 1 ? "Winter" : x == 2 ? "Spring" : x == 3 ? "Summer" : "Autumn"
end;

top2hourdata[!, :Weekend] = map(x -> ismissing(x) ? missing :
    x in ["Mon"; "Tues"; "Wed"; "Thurs"; "Fri"] ? "Weekday" :
    "Weekend", top2hourdata[!, :wday]);

# make names more presentable to final names (positional: one name per column, in order)
renamenames = [
    "Subject_ID", "Wday", "Hour", "mingroup", "Day_worn", "Season", "Steps",
    "dbGaP_Subject_ID", "ep60_maxsteps", "m_total", "sum_valid", "EP60_total_steps",
    "Compliant", "Race", "Stairs", "Genhealth", "BMI", "Age",
    "Smoke", "Smoker", "Weekend"
]
rename!(top2hourdata, renamenames)

# Variables required to be non-missing in the descriptive statistics.
# `descrstats` itself is not built here: `top2hour_restricted` does not exist until the
# next cell, and the summary-statistics section recomputes `descrstats` from the saved
# dataset.
keepvars = [:BMI; :Steps; :Wday; :Hour; :Race; :Age; :Smoker; :Season; :Day_worn; :m_total; :Stairs]

In [None]:
# Keep the records with a positive step count, then add the log10-transformed steps.
positivesteps = top2hourdata[!, :Steps] .> 0.0
keepinds = findall(positivesteps)
top2hour_restricted = top2hourdata[keepinds, :]
top2hour_restricted[!, :Transformed_steps] = log10.(top2hour_restricted[!, :Steps]);

# Optionally save this dataset; the analysis section reads it back from this file.
CSV.write("WHS_final_cleaned.csv", top2hour_restricted)

## Analysis

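As stated in the paper, we use the Knitro solver. If you do not have access to the Knitro solver, you can remove `KNITRO` from the `using` line and the `KNITRO.KnitroSolver(...)` solver argument from the `WiSER.fit!` call; the code will still run, with slightly different but very similar results.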

The following produces the results found in Table 3 of the paper. 

In [None]:
using DataFrames, CSV, WiSER, MixedModels, KNITRO
ENV["COLUMNS"]=1000 #extends the number of columns printed when displaying a dataframe. 

# Load the cleaned dataset from disk.
WHSdata = DataFrame!(CSV.File("WHS_final_cleaned.csv"));

# Set reference levels: each categorical covariate gets an explicit level order.
levelorder = [
    :Smoker => ["Never"; "Past"; "Current"],
    :Race => ["White"; "African American"; "Asian"; "Hispanic"; "Native American"; "Other"],
    :Wday => ["Sun"; "Mon"; "Tues"; "Wed"; "Thurs"; "Fri"; "Sat"]
]
for (col, lvls) in levelorder
    WHSdata[!, col] = levels!(CategoricalArray(WHSdata[!, col]), lvls)
end

In [None]:
"""
    comparemixedmodel(mixedmodel, wsvarmodel)

Return a DataFrame that places the coefficient estimates (`β`) and p-values of a fitted
MixedModels model next to those of a fitted WiSER model, one row per coefficient name
of `mixedmodel`.

Column 4 of each coefficient table is taken to be the p-value column. From the WiSER
table only the first `wsvarmodel.p` entries are used (presumably the mean-model
coefficients -- confirm against the WiSER documentation).
"""
function comparemixedmodel(mixedmodel, wsvarmodel)
    lmmtable = MixedModels.coeftable(mixedmodel)
    wisertable = WiSER.coeftable(wsvarmodel)
    nmean = wsvarmodel.p
    return DataFrame(
        coefnames = MixedModels.coefnames(mixedmodel),
        mixedbeta = mixedmodel.β,
        mixedbetapval = lmmtable.cols[4],
        wsvarbeta = wsvarmodel.β,
        wsvarbetapval = wisertable.cols[4][1:nmean])
end

In [None]:
# Build the WiSER model for the log10-transformed steps, with observations grouped by
# :Subject_ID. Three formulas are passed -- presumably the mean, random-effects and
# within-subject variance formulas, in that order (confirm against the WiSER docs).
# The model is then fitted (and timed) with the Knitro solver.
wisermodel_transformed = WSVarLmmModel(
    @formula(Transformed_steps ~ 1 + BMI + Wday + Hour + 
                Race + Stairs + Age + Smoker + Season + m_total),
    @formula(Transformed_steps ~ 1 + Day_worn), 
    @formula(Transformed_steps ~ 1 + BMI + Wday + Hour + 
                Race + Age + Smoker), 
                :Subject_ID, WHSdata);
@time WiSER.fit!(wisermodel_transformed, KNITRO.KnitroSolver(outlev=0, ftol = 2), parallel = false, runs = 4)

In [None]:
# Fit (and time) a linear mixed model for comparison: the same fixed-effect terms as the
# first WiSER formula, plus a random intercept and Day_worn slope for each Subject_ID.
@time mixedmodel_transformed = fit(LinearMixedModel, 
        @formula(Transformed_steps ~ 1 + BMI + Wday + Hour + 
            Race + Stairs + Age + Smoker + Season + m_total + (1 + Day_worn|Subject_ID)),
        WHSdata)

In [None]:
# Side-by-side table of coefficient estimates and p-values from the two fitted models.
dfcompare_transformed = comparemixedmodel(mixedmodel_transformed, wisermodel_transformed) 

#### Supplementary Table S.3

The following obtains the results of summary statistics found in Supplementary Table S.3.

In [None]:
using DataFrames, CSV, StatsBase, Statistics

# Records must be non-missing in all of these variables to enter the summary statistics.
keepvars = [:BMI, :Steps, :Wday, :Hour, :Race, :Age, :Smoker, :Season, :Day_worn,
    :m_total, :Stairs]
WHSdata = DataFrame!(CSV.File("WHS_final_cleaned.csv"));
descrstats = dropmissing(WHSdata, keepvars)

# Mean, standard deviation and quantiles of the summarised columns.
summarycols = [:Steps, :m_total, :BMI, :Genhealth, :Age, :Day_worn, :Season, :Stairs]
describe(descrstats[!, summarycols], :mean, :std, :min, :q25, :median, :q75, :max)

In [None]:
# Race: one value per subject (taken from the subject's first record), then the
# counts and proportions of each value.
race_by_subject = combine(DataFrames.groupby(descrstats, :Subject_ID), :Race => first)[!, 2]
(countmap(race_by_subject), proportionmap(race_by_subject))

In [None]:
# Days worn: the largest Day_worn value of each subject, then its mean and standard
# deviation across subjects.
maxday_by_subject = combine(DataFrames.groupby(descrstats, :Subject_ID), :Day_worn => maximum)[!, 2]
(mean(maxday_by_subject), std(maxday_by_subject))

In [None]:
# Day of week: counts and proportions over all records.
wday_values = descrstats[!, :Wday]
(countmap(wday_values), proportionmap(wday_values))

In [None]:
# Season: counts and proportions over all records.
season_values = descrstats[!, :Season]
(countmap(season_values), proportionmap(season_values))

In [None]:
# Smoking status: one value per subject (taken from the subject's first record), then
# the counts and proportions of each value.
smoker_by_subject = combine(DataFrames.groupby(descrstats, :Subject_ID), :Smoker => first)[!, 2]
(countmap(smoker_by_subject), proportionmap(smoker_by_subject))