In [1]:
using Pkg
pkg"add https://github.com/kose-y/ParProx.jl"
pkg"add Printf Statistics CSV Mmap CodecZlib ROCAnalysis DataFrames"
pkg"add CUDA Adapt"
pkg"add XLSX"

[32m[1m    Updating[22m[39m git-repo `https://github.com/kose-y/ParProx.jl`
[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m Libiconv_jll ─ v1.16.1+2
[32m[1m   Installed[22m[39m XML2_jll ───── v2.10.3+0
[32m[1m   Installed[22m[39m EzXML ──────── v1.1.0


In [2]:
versioninfo()

Julia Version 1.8.5
Commit 17cfb8e65ea (2023-01-08 06:45 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 32 × 13th Gen Intel(R) Core(TM) i9-13900KF
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, goldmont)
  Threads: 1 on 32 virtual cores
Environment:
  LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [3]:
using ParProx, Printf, Statistics # load the packages
using CSV, DataFrames, CodecZlib, Mmap # packages for data reading. GZip is used to read the gzipped text file.
using Random, CUDA, Adapt
using XLSX

In [4]:
sample_data = DataFrame(CSV.File("data/sample_data.txt"))
methylation_data = DataFrame(CSV.File("data/methylation_data.txt"))
group_info = DataFrame(CSV.File("data/group_info.txt"));

**Look into each data**

**Sample Data (Liver hepatocellular carcinoma study)**

- (429, 34)
- 429 results from 377 unique patients
- 52 patients had two biopsies and we use the first one of each patient

In [5]:
first(sample_data, 5)

Row,V1,bcr_patient_barcode,type,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,menopause_status,birth_days_to,vital_status,tumor_status,last_contact_days_to,death_days_to,cause_of_death,new_tumor_event_type,new_tumor_event_site,new_tumor_event_site_other,new_tumor_event_dx_days_to,treatment_outcome_first_course,margin_status,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
Unnamed: 0_level_1,Int64,String15,String7,Int64?,String7,String,String15,String31,String,String15,Int64?,String15,Int64?,String7,String15,Int64?,Int64?,String15,String31,String15,String,Int64?,String15,String7,String15,Int64,Int64?,Int64?,Int64?,Int64?,Int64?,Int64,Int64?,String15?
1,5426,TCGA-2V-A95S,LIHC,missing,MALE,ASIAN,Stage II,[Not Applicable],Hepatocellular Carcinoma,G3,missing,[Not Available],missing,Alive,WITH TUMOR,missing,missing,[Not Available],,,,missing,[Not Available],,R0,0,missing,0,missing,0,missing,0,missing,missing
2,5427,TCGA-2Y-A9GS,LIHC,58,MALE,WHITE,[Discrepancy],[Not Applicable],Hepatocellular Carcinoma,G2,2006,[Not Available],-21318,Dead,WITH TUMOR,missing,724,[Not Available],Extrahepatic Recurrence,Lung,,102,[Not Available],,R0,1,724,1,724,1,102,1,102,missing
3,5428,TCGA-2Y-A9GT,LIHC,51,MALE,WHITE,Stage I,[Not Applicable],Hepatocellular Carcinoma,G2,2006,[Not Available],-18768,Dead,WITH TUMOR,missing,1624,[Not Available],Locoregional Recurrence,Liver,,1083,[Not Available],,R0,1,1624,1,1624,1,1083,1,1083,missing
4,5429,TCGA-2Y-A9GU,LIHC,55,FEMALE,WHITE,Stage I,[Not Applicable],Hepatocellular Carcinoma,G2,2009,[Not Available],-20187,Alive,TUMOR FREE,1939,missing,[Not Available],,,,missing,[Not Available],,R0,0,1939,0,1939,0,1939,0,1939,missing
5,5430,TCGA-2Y-A9GV,LIHC,54,FEMALE,WHITE,Stage I,[Not Applicable],Hepatocellular Carcinoma,G1,2007,[Not Available],-20011,Dead,WITH TUMOR,missing,2532,[Not Available],New Primary Tumor,"Other, specify",Renal met from prior lung primary,1745,[Not Available],,R0,1,2532,1,2532,0,2532,1,1745,missing


In [6]:
n_duplicates = sum(nonunique(sample_data))
println("There are " * string(n_duplicates) * " duplicated rows")

There are 52 duplicated rows


In [35]:
# unique_data = unique(sample_data, keep=:first)
# dummy coding for race
sample_data.black_or_african_american = sample_data.race .== "BLACK OR AFRICAN AMERICAN"
sample_data.asian = sample_data.race .== "ASIAN"
sample_data.american_indian_or_alaska_native = sample_data.race .== "AMERICAN INDIAN OR ALASKA NATIVE";
select!(sample_data, Not(:race));
# drop missed unpen variables
sample_data.gender = sample_data.gender .== "MALE"
unpen_var = [:gender, :black_or_african_american, :asian, :american_indian_or_alaska_native, :age_at_initial_pathologic_diagnosis]
dropmissing!(sample_data, unpen_var);

In [36]:
# Construct unpenalized data matrix ordered by time to event
survival_event = convert(Vector{Int}, sample_data[:, Symbol("OS")])
survival_time = convert(Vector{Int}, sample_data[:, Symbol("OS.time")])
sort_order = sortperm(survival_time; rev=true)

survival_event = survival_event[sort_order]
survival_time = survival_time[sort_order]
X_unpen = convert(
    Array{Float64},
    Matrix(select(sample_data[sort_order, :], unpen_var))
);

normalize(x) = (x .- mean(x; dims=1)) ./ std(x; dims=1)
X_unpen = normalize(X_unpen);

In [74]:
size(X_unpen)

(428, 5)

**Methylation Data**

- (289508, 430)
- level of methylation observed at each probe(row)
- used in data matrix

In [10]:
first(methylation_data, 5)

Row,Probe,TCGA-2V-A95S,TCGA-2Y-A9GS,TCGA-2Y-A9GT,TCGA-2Y-A9GU,TCGA-2Y-A9GV,TCGA-2Y-A9GW,TCGA-2Y-A9GX,TCGA-2Y-A9GY,TCGA-2Y-A9GZ,TCGA-2Y-A9H0,TCGA-2Y-A9H1,TCGA-2Y-A9H2,TCGA-2Y-A9H3,TCGA-2Y-A9H4,TCGA-2Y-A9H5,TCGA-2Y-A9H6,TCGA-2Y-A9H7,TCGA-2Y-A9H8,TCGA-2Y-A9H9,TCGA-2Y-A9HA,TCGA-2Y-A9HB,TCGA-3K-AAZ8,TCGA-4R-AA8I,TCGA-5C-A9VG,TCGA-5C-A9VH,TCGA-5C-AAPD,TCGA-5R-AA1C,TCGA-5R-AA1D,TCGA-5R-AAAM,TCGA-BC-4072,TCGA-BC-4073,TCGA-BC-A10Q,TCGA-BC-A10Q_1,TCGA-BC-A10R,TCGA-BC-A10R_1,TCGA-BC-A10S,TCGA-BC-A10S_1,TCGA-BC-A10T,TCGA-BC-A10T_1,TCGA-BC-A10U,TCGA-BC-A10U_1,TCGA-BC-A10W,TCGA-BC-A10W_1,TCGA-BC-A10X,TCGA-BC-A10X_1,TCGA-BC-A10Y,TCGA-BC-A10Y_1,TCGA-BC-A10Z,TCGA-BC-A10Z_1,TCGA-BC-A110,TCGA-BC-A110_1,TCGA-BC-A112,TCGA-BC-A112_1,TCGA-BC-A216,TCGA-BC-A216_1,TCGA-BC-A217,TCGA-BC-A3KF,TCGA-BC-A3KG,TCGA-BC-A5W4,TCGA-BC-A69H,TCGA-BC-A69I,TCGA-BC-A8YO,TCGA-BD-A2L6,TCGA-BD-A2L6_1,TCGA-BD-A3EP,TCGA-BD-A3EP_1,TCGA-BD-A3ER,TCGA-BW-A5NO,TCGA-BW-A5NP,TCGA-BW-A5NQ,TCGA-CC-5258,TCGA-CC-5259,TCGA-CC-5260,TCGA-CC-5261,TCGA-CC-5262,TCGA-CC-5263,TCGA-CC-5264,TCGA-CC-A123,TCGA-CC-A1HT,TCGA-CC-A3M9,TCGA-CC-A3MA,TCGA-CC-A3MB,TCGA-CC-A3MC,TCGA-CC-A5UC,TCGA-CC-A5UD,TCGA-CC-A5UE,TCGA-CC-A7IE,TCGA-CC-A7IF,TCGA-CC-A7IG,TCGA-CC-A7IH,TCGA-CC-A7II,TCGA-CC-A7IJ,TCGA-CC-A7IK,TCGA-CC-A7IL,TCGA-CC-A8HS,TCGA-CC-A8HT,TCGA-CC-A8HU,TCGA-CC-A8HV,TCGA-CC-A9FS,⋯
Unnamed: 0_level_1,String15,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,⋯
1,cg00000029,-0.184,-2.328,1.739,-3.696,-0.655,-1.381,-1.407,-1.034,-2.167,-1.531,-1.305,-2.652,-1.808,0.403,-2.276,-3.275,-0.588,-1.682,-0.551,-3.879,1.548,-1.147,0.639,-3.136,-1.862,-0.431,-0.023,-2.335,0.639,-1.423,-1.496,-0.913,1.417,-0.663,-3.681,-0.939,-0.706,-1.179,0.956,-0.621,1.714,-0.124,-2.659,-1.021,-0.959,-0.621,-2.456,-0.743,-1.556,-1.163,-1.063,-1.091,-1.404,-0.992,0.739,-1.055,-2.53,-2.575,-0.207,-1.836,-1.532,-1.954,-0.798,1.609,-0.977,-0.477,-0.346,-1.846,1.215,0.173,-0.615,-3.648,0.631,-0.881,-0.583,0.977,-1.916,-3.517,-0.292,-0.438,-3.466,-1.439,-2.19,-0.78,-2.623,-1.463,0.184,0.345,-0.01,0.61,-3.279,-0.418,-1.336,-4.036,-3.217,-1.287,-2.069,-1.549,1.358,⋯
2,cg00000236,2.887,3.343,3.529,3.446,3.649,3.301,3.158,3.006,3.372,3.061,3.112,2.944,3.122,3.283,3.482,2.665,3.429,2.922,3.196,2.342,3.463,2.951,4.037,2.795,1.697,3.498,2.749,3.329,2.37,2.783,2.649,2.479,3.327,3.078,3.124,2.676,3.408,3.043,2.791,2.71,3.41,2.854,3.289,2.174,2.905,2.847,3.573,2.758,2.661,2.546,2.75,2.855,3.053,3.273,3.511,3.261,3.207,3.569,3.733,2.663,2.889,2.403,2.825,2.634,3.593,3.545,3.536,2.878,3.308,3.113,2.436,3.574,2.19,3.704,2.315,2.731,3.248,2.525,1.588,3.569,3.05,3.609,3.045,2.621,2.81,2.842,2.755,3.245,3.29,2.453,1.528,2.919,2.916,3.708,3.05,2.358,2.828,3.034,2.271,⋯
3,cg00000289,0.082,1.993,2.261,1.303,1.758,1.746,0.842,0.761,1.214,1.545,1.243,1.022,1.453,1.357,1.969,0.843,1.864,0.897,0.007,0.845,1.195,2.062,2.499,0.716,-0.516,0.549,0.768,2.01,0.493,1.934,1.155,0.994,1.602,0.134,0.64,0.598,1.338,1.24,1.724,1.433,1.296,1.682,1.142,0.716,1.548,1.418,1.972,1.977,-0.559,1.845,1.414,1.269,0.591,1.966,2.695,2.074,0.661,0.773,1.245,1.115,0.554,0.476,1.465,1.774,2.002,1.612,1.307,1.159,0.978,1.274,1.36,-0.031,1.961,1.566,1.132,0.245,0.873,0.685,0.661,2.003,0.831,1.595,1.374,0.169,0.706,0.996,1.467,1.507,0.68,0.439,0.892,1.179,1.737,1.315,-0.305,0.236,-0.209,-0.184,0.059,⋯
4,cg00000292,2.579,0.279,0.715,-0.458,-0.828,1.48,0.612,0.249,-1.889,1.791,0.74,0.829,-0.26,2.004,1.364,0.692,0.55,2.412,2.822,0.366,1.823,2.207,1.241,0.467,2.179,0.328,3.358,0.772,1.575,0.493,0.913,1.163,3.856,1.601,1.601,0.714,2.369,1.593,1.804,1.676,0.139,1.986,0.967,2.074,1.28,1.508,1.756,1.455,-0.41,0.738,0.953,1.521,2.36,1.389,0.491,0.319,2.422,0.842,3.746,-0.926,2.202,-1.912,1.776,2.247,1.652,1.378,1.611,1.606,0.367,2.398,0.247,0.365,2.094,0.306,1.875,2.07,2.801,-0.14,1.343,2.954,-0.211,0.484,0.454,1.896,2.518,1.458,1.818,2.202,2.035,-1.336,-0.964,-0.791,0.445,-0.088,-0.416,-0.314,0.832,0.563,3.719,⋯
5,cg00000622,-6.005,-5.953,-6.021,-5.908,-5.868,-6.093,-6.017,-5.717,-6.135,-5.801,-6.014,-5.68,-5.841,-5.756,-5.586,-5.902,-6.196,-5.742,-5.874,-5.962,-5.978,-6.047,-5.93,-6.002,-6.049,-5.654,-6.302,-5.845,-6.27,-6.138,-5.849,-6.041,-5.967,-6.174,-6.294,-5.135,-6.395,-6.153,-6.095,-6.133,-5.57,-5.979,-5.869,-5.986,-6.646,-6.467,-6.195,-6.403,-5.764,-6.238,-6.587,-5.154,-6.289,-6.363,-6.212,-6.322,-6.287,-6.21,-5.632,-6.215,-6.316,-6.141,-5.841,-6.291,-6.323,-6.639,-6.16,-6.218,-5.705,-5.87,-6.114,-6.054,-3.779,-5.988,-5.964,-6.12,-6.039,-5.997,-6.266,-6.013,-6.219,-6.003,-6.197,-6.062,-6.014,-6.068,-5.988,-6.252,-6.04,-6.161,-5.979,-6.217,-5.837,-5.923,-5.82,-5.727,-5.976,-5.869,-6.144,⋯


**use default comparator to determine variable order**

In [11]:
sorted_methyl_data = sort(methylation_data, :Probe)
methyl_probe = permutedims(sorted_methyl_data, 1);

**left join to make data matrix X**

In [38]:
X = leftjoin(select(sample_data, :bcr_patient_barcode), methyl_probe, on = :bcr_patient_barcode => :Probe)
X = X[sort_order, :]
select!(X, Not(:bcr_patient_barcode))
X = convert(
    Array{Float64},
    Matrix(X)
    )
X = normalize(X);

In [75]:
X

428×289508 Matrix{Float64}:
 -0.490202    0.176366    0.72964    …  -1.27485    -1.17097     -1.51366
 -1.40529     0.23954     0.877111      -0.617118   -0.622416    -0.990704
 -0.514305   -1.4955      0.571246       0.217623    1.54421     -0.236567
 -0.514305   -1.4955      0.571246       0.217623    1.54421     -0.236567
 -0.643963   -0.342568   -0.792861       0.769148    0.235958     0.0583919
  0.226241   -0.414768    0.982253   …  -1.66427    -0.403844    -2.0669
 -0.742037    2.86129     1.28266       -1.92513    -0.885552    -1.63642
 -0.187667    0.555413    2.19206        0.705797    0.956398     0.299142
 -0.187667    0.555413    2.19206        0.705797    0.956398     0.299142
  0.588618   -1.09164    -1.44692        2.42745     0.755863     1.0485
 -0.0721383  -0.0943823   0.58217    …  -1.01027    -0.356097    -1.39727
  1.29592    -0.963033   -1.5862         2.01753     4.34003      0.649903
  0.163906   -1.26762    -0.136069       0.444941    1.37126      1.38172
  ⋮ 

**Group Info**

- (370468, 2)
- probe to group matching
- one to many relationship

In [14]:
first(group_info, 5)

Row,Probe,Group
Unnamed: 0_level_1,String15,String31
1,cg22568540,A1BG_1stExon
2,cg22568540,A1BG_5'UTR
3,cg03123289,A1BG_Body
4,cg03630821,A1BG_Body
5,cg08389151,A1BG_Body


**match order of column of data matrix X and probe idx of group info**

In [15]:
sorted_group_info = sort(group_info, :Probe)
all(sorted_methyl_data.Probe .== unique(sorted_group_info.Probe))

true

In [16]:
probe_idx = map(sorted_group_info.Probe) do x
    findfirst(item -> item == x, sorted_methyl_data.Probe)
end;

In [17]:
#group_to_variables = [Int[] for i in 1:length(unique_group)] 
df = DataFrame(
    probe_idx = probe_idx,
    group = sorted_group_info.Group
)
df = groupby(df, :group)
df = combine(df, [:probe_idx] => Ref)
sort!(df, :group);

In [18]:
group_to_probes = df[:, 2]
group_to_probes = convert(
    Vector{Vector{Int}},
    group_to_probes)

91834-element Vector{Vector{Int64}}:
 [238878]
 [238878]
 [37911, 43888, 97240, 121766, 138707, 157929, 236408, 241941, 268631]
 [90060]
 [35928, 51189, 191490]
 [45984, 256928]
 [45984, 256928]
 [286366]
 [6057, 19827, 25915, 37353, 47234, 288652]
 [2637]
 [6057, 6186, 10729, 19698, 19827, 23404, 24969, 25915, 26972, 35297  …  99308, 103634, 114139, 137334, 162988, 180771, 283245, 284922, 286187, 288652]
 [546, 1376, 5911, 6057, 6312, 6539, 7527, 12317, 13192, 27203  …  110884, 111748, 117681, 118410, 183682, 277001, 284408, 285171, 286892, 288652]
 [43339, 75350, 88171, 92283, 209278]
 ⋮
 [206884]
 [29807, 31181, 78723, 94738, 102702, 129557, 174048, 181125, 205436, 230763, 243112, 245508, 250108, 278120, 280040, 287630]
 [6555, 13053, 69732, 88647, 116354, 118627, 136183, 229415]
 [28881, 29842, 79285, 83630, 131484, 176672]
 [235805]
 [49488]
 [36309, 56276, 59893, 68121, 256389, 260424]
 [18377, 30215, 188789, 194835, 239130]
 [4121, 10295, 12862, 49307, 171856, 178601, 196362, 21

In [40]:
Random.seed!(222)
T = Float64
A = CuArray
U = ParProx.COXUpdate(; maxiter=10000, step=20, tol=5e-4, verbose=true)
lambdas = 10 .^ (range(-6, stop=-8, length=21))

scores = ParProx.cross_validate(U, 
    adapt(A{T}, X), adapt(A{T}, X_unpen), adapt(A{Int32}, survival_event), adapt(A{Int32}, survival_time),
    group_to_probes, lambdas, 5, T=T);

20	(-1.9168365617023408, Inf, 5281.0)
40	(-1.9148120146337677, 0.0006945720884945336, 4840.0)
60	(-1.9131227669721873, 0.0005798752049630094, 4495.0)
80	(-1.9116902849072763, 0.0004919761117232274, 4237.0)
  0.305474 seconds (108.36 k allocations: 5.050 MiB)
20	(-1.9077506155662474, Inf, 11879.0)
40	(-1.9048548204705558, 0.0009968811781177111, 10628.0)
60	(-1.9026197474993058, 0.0007700192121877146, 9682.0)
80	(-1.9008268847235703, 0.0006180523164540326, 8941.0)
100	(-1.8993507757082198, 0.0005091170850101747, 8292.0)
120	(-1.8980990505396997, 0.0004319124870100315, 7643.0)
  0.403916 seconds (122.49 k allocations: 5.738 MiB)
20	(-1.8943078312752013, Inf, 18385.0)
40	(-1.8914938879447771, 0.0009731797608689655, 16459.0)
60	(-1.8892822299848908, 0.0007654696854927463, 15032.0)
80	(-1.8874713894687576, 0.0006271371286093988, 13822.0)
100	(-1.8859424811275547, 0.0005297778286300222, 12927.0)
120	(-1.8846247897708572, 0.0004567981809523765, 12211.0)
  0.447007 seconds (122.16 k allocations

In [20]:
# Need to apply one standard error rule 
# sd_cv_error = sd(scores; dim=2)

In [69]:
lambda_idx = argmax(mean(scores; dims=2)[:])
# lambda = lambdas[lambda_idx]
lambda = lambdas[2] # force to select aroung 5e-7

7.943282347242822e-7

In [70]:
U = ParProx.COXUpdate(; maxiter=100000, step=20, tol=1e-5, verbose=true)
V = ParProx.COXVariables{Float64}(
    adapt(A{T}, X), adapt(A{T}, X_unpen), adapt(A{Int32}, survival_event), adapt(A{Int32}, survival_time),
    lambda, group_to_probes)
@time ParProx.fit!(U, V)

20	(-2.0195558471213144, Inf, 6923.0)
40	(-2.0176402430757037, 0.0006348019947063967, 6415.0)
60	(-2.0160577804869457, 0.0005246791354582231, 5986.0)
80	(-2.01472230984673, 0.00044298296922865526, 5571.0)
100	(-2.013575072849272, 0.0003806897023386325, 5236.0)
120	(-2.0125722367754637, 0.0003328836605363955, 4884.0)
140	(-2.011686076949406, 0.00029424043655813966, 4662.0)
160	(-2.010897169453772, 0.0002620174158180656, 4428.0)
180	(-2.010184842129294, 0.00023663906432201567, 4193.0)
200	(-2.009539095121619, 0.0002145667450280255, 4033.0)
220	(-2.0089497752870598, 0.00019585565681408951, 3883.0)
240	(-2.008408481967794, 0.0001799268026633906, 3752.0)
260	(-2.007910473128034, 0.00016556637712758293, 3620.0)
280	(-2.0074471844975865, 0.0001540471376639713, 3505.0)
300	(-2.007016189363327, 0.00014332983499860946, 3421.0)
320	(-2.0066161796676822, 0.00013304315274751834, 3350.0)
340	(-2.0062413425339334, 0.0001246863079305919, 3266.0)
360	(-2.005886349120076, 0.0001180994131602825, 3168.0)


In [71]:
_, grpmat, _ = ParProx.mapper_mat_idx(group_to_probes, length(sorted_methyl_data.Probe));
size(grpmat)

(289508, 370468)

In [72]:
β_orig = vcat(grpmat * collect(V.β[1:end-5]), collect(V.β)[end-4:end]);

In [73]:
pen_β = β_orig[1:end-5]
n_var = sum(pen_β .!= 0)
println("# of survived variables: $n_var")
for (v, β) in zip(sorted_methyl_data.Probe[pen_β .!= 0], pen_β[pen_β .!= 0])
    println("$v\t$β")
end

# of survived variables: 631
cg00016718	0.00035246161499338357
cg00036369	0.0006235265847095207
cg00037450	0.000351446502129293
cg00057663	0.0004598301642023888
cg00059015	0.00031586088105508203
cg00133629	0.00016193401011374244
cg00176876	0.002882270912054324
cg00214688	-0.0001866585165061177
cg00243922	-0.0001786452008955503
cg00261416	-0.000675113215283966
cg00294885	6.317268373542387e-5
cg00307483	0.0002802932777794237
cg00314741	0.000757196701339282
cg00324733	-0.0005607834800145947
cg00346716	0.0014515466871562754
cg00406621	-0.0001936061897975503
cg00423729	-0.00023431287022194626
cg00452079	9.421022697670166e-5
cg00459078	2.6767725754694833e-5
cg00471768	0.0011385124946335877
cg00499599	0.0002758431745403569
cg00533393	2.9222768838094334e-5
cg00673674	0.00030632391200424925
cg00775286	0.0007544443214614643
cg00815230	7.447620227690016e-5
cg00819233	-5.447178140042247e-5
cg00831633	-0.00017603950983566542
cg00913289	0.0008636816791677484
cg00953282	0.00047789320283703196
cg00956

**Given Result Table**

In [29]:
result_path = "supptable5_livercancer_dnamethyl_cox_regression_bbab256.xlsx"
glasso_bygroup = DataFrame(XLSX.readtable("data/"*result_path, "GLASSO_bygroup", 
        column_labels = ["var_group", "cpg_probe", "coefficients", "gene_name", "probe_type"],
        infer_eltypes=true))
glasso_byprobe = DataFrame(XLSX.readtable("data/"*result_path, "GLASSO_byprobe", 
        column_labels = ["cpg_probe", "coefficients"],
        infer_eltypes=true));

In [30]:
first(glasso_bygroup, 5)

Row,var_group,cpg_probe,coefficients,gene_name,probe_type
Unnamed: 0_level_1,String,String?,Float64,Any,String?
1,AATK_TSS1500,cg01633363,-0.00741813,AATK,TSS1500
2,AATK_TSS1500,cg09142578,-0.0066607,AATK,TSS1500
3,ABHD5_TSS1500,cg13606015,-0.00267391,ABHD5,TSS1500
4,ABHD5_TSS1500,cg14121103,-0.00270201,ABHD5,TSS1500
5,ABHD5_TSS1500,cg19150852,-0.00284719,ABHD5,TSS1500


In [31]:
first(glasso_byprobe, 5)

Row,cpg_probe,coefficients
Unnamed: 0_level_1,String,Float64
1,cg00057663,0.0079717
2,cg00119057,-0.0117733
3,cg00170536,0.0170406
4,cg00176876,0.0079252
5,cg00201257,0.0033105


In [32]:
glasso_bygroup[glasso_bygroup.cpg_probe .=== "cg01091258", :]

Row,var_group,cpg_probe,coefficients,gene_name,probe_type
Unnamed: 0_level_1,String,String?,Float64,Any,String?
1,CRIPT_Body,cg01091258,0.00399591,CRIPT,Body
2,PIGF_TSS1500,cg01091258,0.00399591,PIGF,TSS1500


In [33]:
glasso_byprobe[glasso_byprobe.cpg_probe .=== "cg01091258", :]

Row,cpg_probe,coefficients
Unnamed: 0_level_1,String,Float64
1,cg01091258,0.00799181
