# Load gene attributes

In [1]:
using CSV
using DataFrames
using Statistics

df = CSV.File("../KIDNEY/node_attributes_noNA.csv", typemap=Dict(Int => Float64)) |> DataFrame    # read the attribute file
df = unique(df, :name)      # remove possible duplicated genes
genenames = df[!, :name]
first(df, 15)

Unnamed: 0_level_0,name,label,label_wo_outliers
Unnamed: 0_level_1,String,String3?,String3?
1,(clone tec14),missing,missing
2,100 kDa coactivator,missing,missing
3,14-3-3 tau splice variant,missing,missing
4,3-beta-hydroxysteroid dehydrogese,missing,missing
5,3-mercaptopyruvate sulfurtransferase variant,missing,missing
6,3'-phosphoadenosine-5'-phosphosulfate synthase,missing,missing
7,40S ribosomal protein S15a,missing,missing
8,5-aminoimidazole-4-carboxamide ribonucleotide formyltransferase,missing,missing
9,5'-3' exoribonuclease,missing,missing
10,6-O-methylguanine-D methyltransferase,missing,missing


In [2]:
df = df[typeintersect.(colwise(eltype, df), Number) .!= Union{}]      # select only numeric attributes
for col in eachcol(df)                                                # fix missing and naN with mean value
    m = mean(collect(skipmissing(col)))
    std = mean(collect(skipmissing(col)))
    replace!(col, missing => m)
    replace!(col, NaN => m)
    convert.(Float64,col)
end
df = mapcols!(ByRow(Float64), df)                                  # convert Union{missing,Float64} to Float64 types

Unnamed: 0_level_0,gene_length,transcript_count,gc_content,Gtex_kidney,gene_disease_ass_count
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,70061.3,8.86662,46.6611,49.9335,8.87569
2,70061.3,8.86662,46.6611,49.9335,8.87569
3,70061.3,8.86662,46.6611,49.9335,8.87569
4,70061.3,8.86662,46.6611,49.9335,8.87569
5,70061.3,8.86662,46.6611,49.9335,8.87569
6,70061.3,8.86662,46.6611,49.9335,8.87569
7,70061.3,8.86662,46.6611,49.9335,8.87569
8,70061.3,8.86662,46.6611,49.9335,8.87569
9,70061.3,8.86662,46.6611,49.9335,8.87569
10,70061.3,8.86662,46.6611,49.9335,8.87569


### zscore attributes

In [3]:
using StatsBase
dt = fit(ZScoreTransform, Matrix(df), dims=2)
x = StatsBase.transform(dt, Matrix(df)) #|> DataFrame
df = DataFrame(x, names(df))
df[!, :name] = genenames

19296-element Vector{String}:
 "(clone tec14)"
 "100 kDa coactivator"
 "14-3-3 tau splice variant"
 "3-beta-hydroxysteroid dehydrogese"
 "3-mercaptopyruvate sulfurtransferase variant"
 "3'-phosphoadenosine-5'-phosphosulfate synthase"
 "40S ribosomal protein S15a"
 "5-aminoimidazole-4-carboxamide ribonucleotide formyltransferase"
 "5'-3' exoribonuclease"
 "6-O-methylguanine-D methyltransferase"
 "A0A087WZY1"
 "A0A0G2JLL6"
 "A0A0J9YVX5"
 ⋮
 "ZUP1"
 "ZW10"
 "ZWILCH"
 "ZWINT"
 "ZXDA"
 "ZXDB"
 "ZXDC"
 "ZYG11A"
 "ZYG11B"
 "ZYX"
 "ZZEF1"
 "ZZZ3"

## Load labels

In [4]:
dfl = CSV.File("../KIDNEY/node_labels.csv", typemap=Dict(Int => Float64)) |> DataFrame    # read the attribute file
dfl = unique(dfl, :name)         # remove gene duplicates
dfl = filter(:most_freq => !ismissing, dfl)

Unnamed: 0_level_0,name,ACH-000159,ACH-000189,ACH-000234,ACH-000246,ACH-000250,ACH-000262
Unnamed: 0_level_1,String15,String3,String3,String3,String3,String3,String3
1,A1BG,aNE,NE,aNE,aNE,aNE,aNE
2,A1CF,aNE,NE,aNE,aNE,aNE,NE
3,A2M,NE,aNE,aNE,aNE,NE,NE
4,A2ML1,aNE,NE,NE,aNE,NE,NE
5,A3GALT2,NE,NE,aNE,aNE,aNE,aNE
6,A4GALT,NE,NE,aNE,aNE,aNE,NE
7,A4GNT,NE,aNE,aNE,aNE,NE,NE
8,AAAS,aNE,aNE,aNE,aNE,aNE,aNE
9,AACS,aNE,aNE,aNE,aNE,aNE,aNE
10,AADAC,NE,NE,aNE,NE,NE,aNE


## Select labels

In [5]:
dfl = filter(row -> row.most_freq ∈ ["E","NE"], dfl)

Unnamed: 0_level_0,name,ACH-000159,ACH-000189,ACH-000234,ACH-000246,ACH-000250,ACH-000262
Unnamed: 0_level_1,String15,String3,String3,String3,String3,String3,String3
1,A2ML1,aNE,NE,NE,aNE,NE,NE
2,AADAC,NE,NE,aNE,NE,NE,aNE
3,AADACL3,NE,NE,NE,NE,NE,NE
4,AADACL4,NE,aNE,aNE,NE,NE,aNE
5,AAGAB,NE,NE,aNE,NE,NE,NE
6,AARS1,E,E,E,E,E,E
7,AASS,aNE,NE,NE,aNE,NE,aNE
8,ABCA10,aNE,NE,NE,NE,aNE,NE
9,ABCA7,NE,NE,aNE,aNE,NE,NE
10,ABCB7,aE,aE,aE,E,aE,E


In [6]:
genenames = dfl[!, :name]
dfall = innerjoin(dfl,df; on=:name)[!, Not(r"ACH-")]         # join labels and attributes dataframes

Unnamed: 0_level_0,name,most_freq,gene_length,transcript_count,gc_content,Gtex_kidney,gene_disease_ass_count
Unnamed: 0_level_1,String15,String3,Float64,Float64,Float64,Float64,Float64
1,A2ML1,NE,5.1025,-0.18949,-0.186594,-0.190227,-0.189655
2,AADAC,NE,5.1021,-0.191903,-0.179672,-0.192628,-0.192271
3,AAGAB,NE,5.10242,-0.19044,-0.187288,-0.189219,-0.190731
4,AARS1,E,5.10231,-0.186921,-0.185857,-0.185807,-0.192535
5,AASS,NE,5.10247,-0.188652,-0.188019,-0.189985,-0.189515
6,ABCA10,NE,5.10252,-0.18866,-0.187407,-0.189302,-0.189257
7,ABCA7,NE,5.10233,-0.188907,-0.180102,-0.191129,-0.190564
8,ABCB7,E,5.10249,-0.189417,-0.188465,-0.189763,-0.189817
9,ABCC1,NE,5.10251,-0.18946,-0.188325,-0.189396,-0.189104
10,ABCC11,NE,5.10251,-0.189012,-0.186644,-0.189591,-0.189529


# Load embedding

In [93]:
dfe = CSV.File("../KIDNEY/embeddings/PPI_Node2Vec_64.csv") |> DataFrame    # read the attribute file
dfall = innerjoin(dfall,dfe; on=:name)               # join labels/attributes dataframe with embedding

LoadError: ArgumentError: Duplicate variable names: :Node2Vec_1, :Node2Vec_2, :Node2Vec_3, :Node2Vec_4, :Node2Vec_5, :Node2Vec_6, :Node2Vec_7, :Node2Vec_8, :Node2Vec_9, :Node2Vec_10, :Node2Vec_11, :Node2Vec_12, :Node2Vec_13, :Node2Vec_14, :Node2Vec_15, :Node2Vec_16, :Node2Vec_17, :Node2Vec_18, :Node2Vec_19, :Node2Vec_20, :Node2Vec_21, :Node2Vec_22, :Node2Vec_23, :Node2Vec_24, :Node2Vec_25, :Node2Vec_26, :Node2Vec_27, :Node2Vec_28, :Node2Vec_29, :Node2Vec_30, :Node2Vec_31, :Node2Vec_32, :Node2Vec_33, :Node2Vec_34, :Node2Vec_35, :Node2Vec_36, :Node2Vec_37, :Node2Vec_38, :Node2Vec_39, :Node2Vec_40, :Node2Vec_41, :Node2Vec_42, :Node2Vec_43, :Node2Vec_44, :Node2Vec_45, :Node2Vec_46, :Node2Vec_47, :Node2Vec_48, :Node2Vec_49, :Node2Vec_50, :Node2Vec_51, :Node2Vec_52, :Node2Vec_53, :Node2Vec_54, :Node2Vec_55, :Node2Vec_56, :Node2Vec_57, :Node2Vec_58, :Node2Vec_59, :Node2Vec_60, :Node2Vec_61, :Node2Vec_62, :Node2Vec_63 and :Node2Vec_64. Pass makeunique=true to make them unique using a suffix automatically.

In [101]:
X = Matrix(dfall[!, Not(r"name|most_freq")])
y = vec(Matrix(DataFrames.select(dfall, [:most_freq])))
typeof(X), size(X), typeof(y), size(y)

(Matrix{Float64}, (5661, 92), Vector{String3}, (5661,))

In [104]:
using MLJ
using LightGBM
using StableRNGs
LIGHTGBM_SOURCE = abspath("~/LightGBM-3.2.0")
X, y = @load_iris;
display(X)
rng = StableRNG(566)
train, test = partition(eachindex(y), 0.7, shuffle=true, rng=rng)
model = LightGBM.MLJInterface.LGBMClassifier()
mach = machine(model, X, y)

MLJ.fit!(mach, rows=train) 
MLJ.predict(mach, rows=test)

(sepal_length = [5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9  …  6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9],
 sepal_width = [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1  …  3.1, 3.1, 2.7, 3.2, 3.3, 3.0, 2.5, 3.0, 3.4, 3.0],
 petal_length = [1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5  …  5.6, 5.1, 5.1, 5.9, 5.7, 5.2, 5.0, 5.2, 5.4, 5.1],
 petal_width = [0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1  …  2.4, 2.3, 1.9, 2.3, 2.5, 2.3, 1.9, 2.0, 2.3, 1.8],)

┌ Info: Training [34mMachine{LGBMClassifier,…} @230[39m.
└ @ MLJBase /Users/maurizio/.julia/packages/MLJBase/hLtde/src/machines.jl:342
┌ Error: Problem fitting the machine [34mMachine{LGBMClassifier,…} @230[39m. 
└ @ MLJBase /Users/maurizio/.julia/packages/MLJBase/hLtde/src/machines.jl:472
┌ Info: Running type checks... 
└ @ MLJBase /Users/maurizio/.julia/packages/MLJBase/hLtde/src/machines.jl:478
┌ Info: Type checks okay. 
└ @ MLJBase /Users/maurizio/.julia/packages/MLJBase/hLtde/src/machines.jl:481


LoadError: ArgumentError: NULL library handle

In [99]:
using ScikitLearn.CrossValidation: cross_val_predict
using DecisionTree
using MLJBase
using MLJ

using LightGBM
LIGHTGBM_SOURCE = abspath("~/LightGBM-3.2.0")
model = LightGBM.MLJInterface.LGBMClassifier

function confusionmatrix(predictions, labels)
   classes = vec(unique(labels))
   d = size(classes)[1]
   idx = Dict(zip(classes,Vector(1:d)))
   c = zeros(Int64, d,d)
   for i in 1:size(labels)[1]
       c[idx[labels[i]] ,idx[predictions[i]]] += 1 
   end 
   df = DataFrame(c, classes)
   df[!, :name] = classes
   return df
end
model = DecisionTree.RandomForestClassifier()
ŷ = CategoricalArray(cross_val_predict(model, X, y; cv=5))
y = CategoricalArray(y)
println("Accuracy: \t$(MLJBase.Accuracy()(ŷ,y))")
println("Balanced Acc: \t$(MLJBase.BalancedAccuracy()(ŷ,y))")
println("MCC: \t\t$(MLJBase.MatthewsCorrelation()(ŷ,y))")
confusionmatrix(ŷ,y)

LoadError: MethodError: no method matching fit!(::Type{LightGBM.MLJInterface.LGBMClassifier}, ::Matrix{Float64}, ::CategoricalVector{String3, UInt32, String3, CategoricalValue{String3, UInt32}, Union{}})