In [370]:
using Pkg
Pkg.add(PackageSpec(path="https://github.com/diegozea/ROC.jl"))

[32m[1m   Cloning[22m[39m git-repo `https://github.com/diegozea/ROC.jl`
[2K[?25h[32m[1m  Updating[22m[39m git-repo `https://github.com/diegozea/ROC.jl`.0 %46.3 %>  ]  93.1 %
[32m[1m Installed[22m[39m Infinity ─ v0.2.3
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
 [90m [e4f92426][39m[92m + ROC v0.1.0 #master (https://github.com/diegozea/ROC.jl)[39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
 [90m [a303e19e][39m[92m + Infinity v0.2.3[39m
 [90m [e4f92426][39m[92m + ROC v0.1.0 #master (https://github.com/diegozea/ROC.jl)[39m


In [391]:
using JuMP, Gurobi, CSV, LinearAlgebra, DataFrames, Random, Distributions, Statistics,MLBase, CPUTime,ScikitLearn ,MLDataUtils
@sk_import metrics: roc_auc_score
gurobi_env = Gurobi.Env()

Academic license - for non-commercial use only


Gurobi.Env(Ptr{Nothing} @0x00007fa9e453c000)

In [429]:
function one_hot_encode(X, names)
    X2 = deepcopy(X)
    select!(X2, Not(Symbol.(names)))
    for i in names
        vales = unique(X[i])
        for j in 1:length(vales)-1
           X2[Symbol(string(i)*"_"*string(vales[j]))] = (X[i].==vales[j])*1
        end
    end
    return X2
end

function normalize(X, names)
    X2 = deepcopy(X)
    select!(X2, Not(Symbol.(names)))
    for j in names
        X2[j] = (X[:,j] .- mean(X[:,j])) / std(X[:,j])
    end
    return X2
end

function clean(X)
    n,p = size(X)
    i = 0
    while i < n
        i += 1
        if "?" in X[i,:]
            X = X[1:end .!= i, :]
            i -= 1
        end
        n,_ = size(X)
    end
    return X
end     

function toNum(df, names)
    n,p = size(df)
    for name in names
        if !(isa(df[1,name], Int64) || isa(df[1,name], Float64))
            temp = zeros(n)
            for i=1:n
                temp[i] = parse(Float64,df[i,name])
            end
            df[!,name] = temp
        end
    end
    return df
end

function preprocess(df, categorical_vars, numerical_vars)
    df = clean(df)
    df = toNum(df,numerical_vars)
    df = normalize(df,numerical_vars)
    df = one_hot_encode(df[:,1:end], categorical_vars) 
    df[df[:,end].==0,end] .= -1
    return df
end

preprocess (generic function with 1 method)

In [1063]:
### Utils Functions ###
# function compute_∇f(w_k, y, X, λ)
#     n, p = size(X)
#     temp = zeros(p)
#     for i in 1:n
#         expo = log(1+exp(-y[i]*dot(X[i,:], w_k)+ λ*dot(w_k,w_k)))
#         if expo == Inf
#             expo = -y[i]*dot(X[i,:], w_k)+ λ*dot(w_k,w_k)
#         end
#          Δ = (1/(1+t))*t*(-y[i]*Array(X[i,:]) .+ 2*λ*w_k)
# #        Δ = (-1/(1+exp(y[i]*dot(w_k,X[i,:])+λ*transpose(w_k)*w_k)))*(y[i].*X[i,:] .+ 2*λ*w_k)
#         temp = temp + Δ

#     end

#     ∇f_k = temp
#     return ∇f_k
# end
function compute_∇f(w_k, y, X, λ)
    n, p = size(X)
    temp = zeros(p)
    for i in 1:n
        t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)
#       t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)

        if t == Inf
            Δ = -y[i]*Array(X[i,:]) .+ 2*λ*w_k
        else
            Δ = (1/(1+t))*t*(-y[i]*Array(X[i,:]) .+ 2*λ*w_k)
        end
        temp = temp + Δ
    end
    ∇f_k = temp
    return ∇f_k
end

function compute_f(w_k, y, X, λ)
    n, p = size(X)
    taylor = 0
    f_k = 0
    for i in 1:n
        t = exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)
        if t == Inf 
            f_k += -y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k
        else
            f_k += log(1+t)
        end
    end

    return f_k
end

compute_f (generic function with 1 method)

In [1095]:
### Cutting Planes Implementation ###
function LR_cutting_planes(y, X, ε, λ)
    errors = []
    n, p = size(X)
    # Initialization values and step 0
    w_0 = [0 for i in 1:p]
    #w_0 = [rand(Uniform(-0.5, 0.5)) for i in 1:p]
    f_0 = sum(log(1+exp(-y[i]*dot(X[i,:], w_0)+λ*transpose(w_0)*w_0)) for i=1:n)
    ∇f_0 = compute_∇f(w_0, y, X, λ)

    # Outer minimization problem
    outer_min_model = Model(solver=GurobiSolver(OutputFlag=0, gurobi_env))
    @variable(outer_min_model, t >= 0)
    @variable(outer_min_model, w[1:p])
    #@constraint(outer_min_model, [j=1:p], -1 <= w[j] <= 1)
    @constraint(outer_min_model, t >= f_0 + (dot(∇f_0, w)-dot(∇f_0, w_0)))
    @constraint(outer_min_model, [j=1:p], 10 >= w[j])
    @constraint(outer_min_model, [j=1:p], w[j] >= -10)
    @objective(outer_min_model, Min, t)
    k = 1 # Number of constraints in the final problem
    solve(outer_min_model)

    # New steps k
    t_k = getvalue(t)
    w_k = getvalue(w)

    f_k = compute_f(w_k, y, X, λ)
    #f_k = sum(min(log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)),taylor) for i=1:n)
    
    ∇f_k = compute_∇f(w_k, y, X, λ)
    
    while abs(f_k - t_k) >= ε # error

        push!(errors, f_k - t_k)
        @constraint(outer_min_model,t >= f_k +(dot(∇f_k, w)-dot(∇f_k, w_k)))
        k += 1
        solve(outer_min_model)
        # Updating all the values
        t_k = getvalue(t)
        w_k = getvalue(w)
        
        f_k = compute_f(w_k, y, X, λ)
        #f_k = sum(min(log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)),100) for i=1:n)

        ∇f_k = compute_∇f(w_k, y, X, λ)
         if k%500 == 0
             println("Number of constraints: ", k, "\t Error = ", abs(t_k - f_k))
         end
        if k > 20000
            break
        end
    end
    push!(errors, f_k - t_k)
    return t_k, f_k, w_k, errors
end


LR_cutting_planes (generic function with 1 method)

In [1162]:
function robust_LG_cv(X, y, ε, lambda_vals; method=LR_cutting_planes, nfold = 5)
    n,p = size(X)
    permuted_indices = randperm(n)
    assignments = repeat(1:nfold, convert(Int,ceil(n/nfold)))[1:n]
    AUCs = zeros(length(lambda_vals))
    for fold in 1:nfold
        println("nfold=",fold)
        train_indices, valid_indices = permuted_indices[assignments.!=fold], permuted_indices[assignments.==fold]

        X_train, y_train = X[train_indices,:], y[train_indices]
        X_valid, y_valid = X[valid_indices,:], y[valid_indices] 
         
        for (i,λ) in enumerate(lambda_vals)
            println(i)
            t, f, w, e = method(y_train, X_train, ε, λ)
    #         pred = 1 ./ (1 .+ exp.(-(Matrix(X_valid)*w).+λ*transpose(w)*w)) .> 0.5
    #         accuracies[i] = 1-sum(pred .!= (y_valid .== 1))/length(y_valid)
            pred_prob = 1 ./ (1 .+ exp.(-(Matrix(X_valid)*w).+λ*transpose(w)*w))
            AUCs[i] += roc_auc_score(y_valid ,pred_prob)
        end
    end
    IJulia.clear_output()

    i_best = argmax(AUCs)
    t, f, w_best, e = method(y, X, ε, lambda_vals[i_best])
    return w_best, lambda_vals[i_best]
end

robust_LG_cv (generic function with 1 method)

In [1163]:
function robust_LG_valid(X, y, ε, lambda_vals; method=LR_cutting_planes, split_at=0.8)
    n,p = size(X)
    split = convert(Int,floor(split_at*n))
    for 
    permuted_indices = randperm(n)
    train_indices, valid_indices = permuted_indices[1:split], permuted_indices[split+1:end]
    X_train, y_train = X[train_indices,:], y[train_indices]
    y_valid, y_valid = X[valid_indices,:], y[valid_indices]
    
    AUCs = zeros(length(lambda_vals))
    for (i,λ) in enumerate(lambda_vals)
        println(i)
        t, f, w, e = method(y_train, X_train, ε, λ)
#         pred = 1 ./ (1 .+ exp.(-(Matrix(X_valid)*w).+λ*transpose(w)*w)) .> 0.5
#         accuracies[i] = 1-sum(pred .!= (y_valid .== 1))/length(y_valid)
        pred_prob = 1 ./ (1 .+ exp.(-(Matrix(y_valid)*w).+λ*transpose(w)*w))
        AUCs[i] = roc_auc_score(y_valid ,pred_prob)
    end
    IJulia.clear_output()

    i_best = argmax(AUCs)
    t, f, w_best, e = method(y, X, ε, lambda_vals[i_best])
    return w_best, lambda_vals[i_best]
end

LoadError: syntax: incomplete: "function" at none:1 requires end

In [1169]:
function test_cv(df, categorical_vars, numerical_vars, test_split, validation_split, ε, lambda_vals, cv, seed)
    Random.seed!(seed)
    
    n,_ = size(df)
    permuted_indices = randperm(n)
    test_split = convert(Int,floor(validation_split*n))
    train_indices, test_indices = permuted_indices[1:test_split], permuted_indices[test_split+1:end]
    train = df[train_indices,:]
    test = df[test_indices,:]
    train_X = train[:,1:end-1]
    train_y = train[:,end]
    test_X = test[:,1:end-1]
    test_y = test[:,end]
    IJulia.clear_output()
    
    println("Enter cross-validation")
    start = time()
    w, λ = robust_LG_cv(train_X, train_y, ε, lambda_vals; method=LR_cutting_planes, nfold=cv)
    elapsed = time() - start
    
    pred_prob = 1 ./ (1 .+ exp.(-(Matrix(test_X)*w) .+ λ*transpose(w)*w))
    pred = pred_prob.> 0.5
    accuracy = 1-sum(pred .!= (test_y .== 1))/length(test_y)
    auc = roc_auc_score(test_y ,pred_prob)
    IJulia.clear_output()

    return auc, accuracy, λ, elapsed, w
end

test_cv (generic function with 1 method)

In [1170]:
function test(df, categorical_vars, numerical_vars, test_split, validation_split, ε, lambda_vals, seed)
    Random.seed!(seed)
    start = time()
    n,_ = size(df)
    permuted_indices = randperm(n)
    test_split = convert(Int,floor(validation_split*n))
    train_indices, test_indices = permuted_indices[1:test_split], permuted_indices[test_split+1:end]
    train = df[train_indices,:]
    test = df[test_indices,:]
    train_X = train[:,1:end-1]
    train_y = train[:,end]
    test_X = test[:,1:end-1]
    test_y = test[:,end]
    
    IJulia.clear_output()
    println("Enter cross-validation")
    w, λ, errors  = robust_LG_valid(train_X, train_y, ε, lambda_vals; method=LR_cutting_planes, split_at=validation_split)
    elapsed = time() - start
    pred_prob = 1 ./ (1 .+ exp.(-(Matrix(test_X)*w).+λ*transpose(w)*w))
    pred = pred_prob.> 0.5
    accuracy = 1-sum(pred .!= (test_y .== 1))/length(test_y)
    auc = roc_auc_score(test_y ,pred_prob)
    IJulia.clear_output()

    return auc, accuracy, λ, elapsed, w
end

test (generic function with 3 methods)

In [1171]:
df = CSV.read("Data/caesarian.csv";header=true)
size(df)

(80, 6)

In [1172]:
lambda_vals = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1]
categorical_vars = Symbol.(["Delivery number" ;"Delivery time";"Blood of Pressure";"Heart Problem"])
numerical_vars = Symbol.(["Age"])
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

(80, 10)

In [1173]:
seed = 1
auc, acc, lambda, elapsed, w = test_cv(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, 5, 1)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.7676767676767677 Accuracy: 0.6 λ: 0.1 Time: 6.100132942199707

In [1161]:
seed = 1
auc, acc, lambda, elapsed, w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, 1)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.7575757575757576 Accuracy: 0.6 λ: 1.0e-5 Time: 1.3956480026245117

In [1101]:
w

9-element Array{Float64,1}:
 -1.2491146835999314 
 -0.39400162440683767
  0.8613423821765253 
 -0.32102663050231073
 -0.8139954311049188 
  0.6009822082585471 
  1.3343957893626073 
  0.4155759958315488 
  1.4982238682425273 

In [1102]:
seed = 2
auc, acc, lambda, elapsed = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.7637362637362638 Accuracy: 0.7 λ: 1.0e-5 Time: 1.3824620246887207

In [1103]:
seed = 3
auc, acc, lambda, elapsed = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.41500000000000004 Accuracy: 0.35 λ: 0.05 Time: 1.432671070098877

In [1104]:
seed = 4
auc, acc, lambda, elapsed = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.609375 Accuracy: 0.5 λ: 1.0e-5 Time: 1.4455699920654297

In [1105]:
df = CSV.read("Data/monks-1.test";header=false)[:,2:end]
select!(df, Not(Symbol.("Column9")))
df[!, 1], df[!, end] = df[!, end], df[!, 1]
size(df)

(432, 7)

In [1106]:
lambda_vals = [0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.15]
categorical_vars = propertynames(df)
numerical_vars = []
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

(432, 12)

In [1107]:
seed = 1
auc, acc, lambda, elapsed, w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.7127586206896552 Accuracy: 0.8055555555555556 λ: 0.0001 Time: 10.483556985855103

In [1108]:
w

11-element Array{Float64,1}:
 -0.19944295285656555
 -0.33591579233249164
 -0.2301026230951267 
 -0.39198603439158647
 -0.10837953319927252
  0.1260196467374425 
 -0.11952095188224586
 -0.1318511976705135 
  6.774467365172956  
 -0.11512308916662675
 -0.17504668832152295

In [1109]:
seed = 2
auc, acc, lambda, elapsed, w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.7620192307692308 Accuracy: 0.7777777777777778 λ: 0.1 Time: 10.872619867324829

In [1110]:
w

11-element Array{Float64,1}:
 -0.08047366342341884 
 -0.06486894638733899 
  0.01632846139507369 
 -0.08297882024619946 
  0.017129200465058517
  0.02125276026886155 
 -0.05372634151023013 
 -0.018441210454888616
  0.8516057927454216  
 -0.21500761774344815 
 -0.22731284413419067 

In [1111]:
seed = 3
auc, acc, lambda, elapsed = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.6411663807890223 Accuracy: 0.7222222222222222 λ: 0.0001 Time: 11.662354946136475

In [1131]:
df = CSV.read("Data/credit-screening/crx.data";header=false)
n,p=size(df)

(690, 16)

In [1132]:
categorical_vars = propertynames(df[1,vcat(1,4:7,9:10,12:13,16)])
numerical_vars = propertynames(df[1,vcat(2:3,8,11,14:15)])
lambda_vals = [0.0001,0.0005,0.001,0.005,0.01]
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

(653, 38)

In [1133]:
seed = 1
auc, acc, lambda, elapsed,w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.917298824929347 Accuracy: 0.8780487804878049 λ: 0.0001 Time: 283.30889797210693

In [1134]:
seed = 2
auc, acc, lambda, elapsed,w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.9585037104346509 Accuracy: 0.8841463414634146 λ: 0.005 Time: 302.3048839569092

In [1135]:
seed = 3
auc, acc, lambda, elapsed,w = test(df,categorical_vars,numerical_vars,0.75,0.75,0.0001, lambda_vals, seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.9219504830917874 Accuracy: 0.8658536585365854 λ: 0.01 Time: 361.72257113456726

In [1136]:
maximum(w)

3.0625289434121163

In [1137]:
df = CSV.read("Data/framingham.csv";header=true)
size(df)

(3658, 16)

In [1138]:
categorical_vars = Symbol.(["education"])
numerical_vars = Symbol.(["age" ;"cigsPerDay";"totChol";"sysBP";"diaBP";"BMI";"heartRate";"glucose"])
lambda_vals = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1]
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

(3658, 18)

In [1139]:
seed = 1
auc, acc, lambda,elapsed,w = test(df,categorical_vars,numerical_vars, 0.75, 0.75,0.0001, lambda_vals,seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.8933895907323683 Accuracy: 0.8273224043715847 λ: 0.001 Time: 227.89117217063904

In [1140]:
seed = 2
auc, acc, lambda,elapsed = test(df,categorical_vars,numerical_vars, 0.75, 0.75,0.0001, lambda_vals,seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.8835407884949168 Accuracy: 0.8185792349726776 λ: 1.0e-5 Time: 229.15803718566895

In [1141]:
seed = 3
auc, acc, lambda,elapsed = test(df,categorical_vars,numerical_vars, 0.75, 0.75,0.0001, lambda_vals,seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

AUC: 0.9007934540044631 Accuracy: 0.8360655737704918 λ: 1.0e-5 Time: 228.73048520088196

In [1142]:
df = CSV.read("Data/default of credit card clients.csv";header=true)
n,p = size(df)

(30000, 25)

In [1143]:
propertynames(df[1,vcat(4:5)])

2-element Array{Symbol,1}:
 :EDUCATION
 :MARRIAGE 

In [1144]:
categorical_vars = propertynames(df[1,vcat(4:5)])
numerical_vars = propertynames(df[1,vcat([2,6],7:24)])
lambda_vals = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05]
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

(30000, 32)

In [1145]:
seed = 1
auc, acc, lambda,elapsed,w = test(df,categorical_vars,numerical_vars, 0.75, 0.75,0.0001, lambda_vals,seed)
print("AUC: ",auc," Accuracy: ",acc, " λ: ", lambda, " Time: ", elapsed)

Enter cross-validation
1
Number of constraints: 500	 Error = 3034.147460961999
Number of constraints: 1000	 Error = 2038.3273531629804
Number of constraints: 1500	 Error = 757.9373835279533
Number of constraints: 2000	 Error = 1099.0112429838846
Number of constraints: 2500	 Error = 1068.8599842678923
Number of constraints: 3000	 Error = 1037.9689351804584
Number of constraints: 3500	 Error = 1446.651848430472
Number of constraints: 4000	 Error = 1085.485219207882
Number of constraints: 4500	 Error = 656.6829614483964
Number of constraints: 5000	 Error = 727.3618853526762
Number of constraints: 5500	 Error = 1871.9293495737795
Number of constraints: 6000	 Error = 538.0591961111122
Number of constraints: 6500	 Error = 576.5444218938436
Number of constraints: 7000	 Error = 491.20520704892573
Number of constraints: 7500	 Error = 483.3426844158958
Number of constraints: 8000	 Error = 572.726774638979
Number of constraints: 8500	 Error = 475.50508825754514
Number of constraints: 9000	 Error 

InterruptException: InterruptException:

In [1146]:
w

17-element Array{Float64,1}:
  0.7564803178300495 
  0.6510781125823433 
  0.05577898804498388
  0.25542433043219676
  0.25844451713654576
  0.886531016636607  
  0.328422611020675  
  0.30110156571079594
 -0.3114629158611582 
 -0.10614015823341487
  0.16820402648139052
 -0.27178104345394566
  0.366384084231571  
  0.11367239852267316
 -0.17436851270711196
 -4.9729848625886035 
 -5.615896549915456  