In [185]:
using JuMP 
using Gurobi 
using CSV 
using LinearAlgebra
using DataFrames
using Random
using Statistics
using Distributions
using MLBase
using CPUTime
using ScikitLearn
using MLDataUtils
@sk_import metrics: roc_auc_score



PyObject <function roc_auc_score at 0x149695430>

In [2]:
gurobi_env = Gurobi.Env()

Academic license - for non-commercial use only


Gurobi.Env(Ptr{Nothing} @0x00007ff98da40c00)

### Data Preparation

In [71]:
function one_hot_encode(X, names)
    X2 = deepcopy(X)
    select!(X2, Not(Symbol.(names)))
    for i in names
        vales = unique(X[i])
        for j in 1:length(vales)-1
           X2[Symbol(string(i)*"_"*string(vales[j]))] = (X[i].==vales[j])*1
        end
    end
    return X2
end

function normalize(X, names)
    X2 = deepcopy(X)
    select!(X2, Not(Symbol.(names)))
    for j in names
        X2[j] = (X[:,j] .- mean(X[:,j])) / std(X[:,j])
    end
    return X2
end

function clean(X)
    n,p = size(X)
    X2 = deepcopy(X)
    i = 0
    while i < n
        i += 1
        if "?" in X[i,:]
            X = X[1:end .!= i, :]
            i -= 1
        end
        n,_ = size(X)
    end
    return X
end     

function toNum(df, names)
    n,p = size(df)
    for name in names
        if !(isa(df[1,name], Int64) || isa(df[1,name], Float64))
            temp = zeros(n)
            for i=1:n
                temp[i] = parse(Float64,df[i,name])
            end
            df[!,name] = temp
        end
    end
    return df
end

function preprocess(df, categorical_vars, numerical_vars)
    df = clean(df)
    df = toNum(df,numerical_vars)
    df = normalize(df,numerical_vars)
    df = one_hot_encode(df[:,1:end], categorical_vars) 
    df[df[:,end].==0,end] .= -1
    return df
end

preprocess (generic function with 1 method)

### Tuning SRLR

In [116]:
function trainvalid_test_split(X, y, split_at=0.7)
    n,p = size(X)
    split = convert(Int,floor(split_at*n))
    permuted_indices = randperm(n)
    trainvalid_indices, test_indices = permuted_indices[1:split], permuted_indices[split+1:end]
    X_trainvalid, y_trainvalid = X[trainvalid_indices,:], y[trainvalid_indices]
    X_test, y_test = X[test_indices,:], y[test_indices]
    return X_trainvalid, X_test, y_trainvalid, y_test
end

function results_srlr(X, y, alpha_list, epsilon, seed)
    Random.seed!(seed)
    X_trainvalid, X_test, y_trainvalid, y_test = trainvalid_test_split(X, y, 0.8)
    
    roc_aucs = zeros(length(alpha_list))

    print("START VALIDATION\n\n")
    
    for (i,alpha) in enumerate(alpha_list)
        try
            print("CHECK ALPHA: ", alpha, "\n\n")
            (t_opt, f_opt, w_opt, z_opt, deltas) = srlr(y_trainvalid, X_trainvalid, epsilon, floor(Int, 0.9*size(X_trainvalid)[1]), alpha)

            train_index = z_opt.>0
            validation_index = z_opt.==0

            X_train = X_trainvalid[train_index,:]
            X_val = X_trainvalid[validation_index,:]
            y_train = y_trainvalid[train_index,:]
            y_val = y_trainvalid[validation_index,:]

            print("Percent class 1 in validation: ", (length(y_val)+sum(y_val))/(2*length(y_val)), "\n")
            print("Percent class 1 in train: ", (length(y_train)+sum(y_train))/(2*length(y_train)), "\n")

            pred_prob_val = 1 ./ (1 .+ exp.(-(Matrix(X_val)*w_opt).+alpha*transpose(w_opt)*w_opt))
            roc_aucs[i] = roc_auc_score(y_val ,pred_prob_val)
            print("Validation ROCAUC: ", roc_auc_score(y_val ,pred_prob_val))
        catch err
            print(alpha, " didnt work")
        end
    end
    
    print("ENDED VALIDATION\n\n")
    
    i_best = argmax(roc_aucs)
    best_alpha = alpha_list[i_best]
    
    t_final, f_final, w_final, errors = rlr(y_trainvalid, X_trainvalid, epsilon, best_alpha)
    pred_prob_test = 1 ./ (1 .+ exp.(-(Matrix(X_test)*w_final).+best_alpha*transpose(w_final)*w_final))
    pred_prob_trainvalid = 1 ./ (1 .+ exp.(-(Matrix(X_trainvalid)*w_final).+best_alpha*transpose(w_final)*w_final))
    auc_trainvalid = roc_auc_score(y_trainvalid ,pred_prob_trainvalid)
    auc_test = roc_auc_score(y_test ,pred_prob_test)
    
    return (best_alpha,auc_trainvalid,auc_test,pred_prob_test,pred_prob_trainvalid)
    
end

results_srlr (generic function with 3 methods)

### Robust Logistic Regression

In [187]:
### Utils Functions ###
function compute_∇f(w_k, y, X, λ)
    n, p = size(X)
    temp = zeros(p)
    for i in 1:n
        t = exp(y[i]*(transpose(w_k)*Array(X[i,:]))-λ*transpose(w_k)*w_k)
#         t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)
        Δ = (1/(1+t))*(-y[i]*Array(X[i,:]) .+ 2*λ*w_k)
        temp = temp + Δ
    end
    ∇f_k = temp
    return ∇f_k
end

function rlr(y, X, ε, λ)
    errors = []
    n, p = size(X)
    w_0 = [0 for i in 1:p]
    f_0 = sum(log(1+exp(-y[i]*dot(X[i,:], w_0)+λ*transpose(w_0)*w_0)) for i=1:n)
    ∇f_0 = compute_∇f(w_0, y, X, λ)

    # Outer minimization problem
    outer_min_model = Model(solver=GurobiSolver(OutputFlag=0, gurobi_env))
    @variable(outer_min_model, t >= 0)
    @variable(outer_min_model, w[1:p])
    #@constraint(outer_min_model, [j=1:p], -1 <= w[j] <= 1)
    @constraint(outer_min_model, t >= f_0 + (dot(∇f_0, w)-dot(∇f_0, w_0)))
    @constraint(outer_min_model, [j=1:p], 10 >= w[j])
    @constraint(outer_min_model, [j=1:p], w[j] >= -10)
    @objective(outer_min_model, Min, t)
    k = 1 # Number of constraints in the final problem
    solve(outer_min_model)

    # New steps k
    t_k = getvalue(t)
    w_k = getvalue(w) 
    
    exponents_f_k = []
    
    for i in 1:n
        if log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)) == Inf
            append!(exponents_f_k, -y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)
        else
            append!(exponents_f_k, log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)))
        end
    end
    f_k = sum(exponents_f_k[i] for i=1:n)
    ∇f_k = compute_∇f(w_k, y, X, λ)
    
    while abs(f_k - t_k) >= ε # error

        push!(errors, f_k - t_k)
            
        @constraint(outer_min_model,t >= f_k +(dot(∇f_k, w)-dot(∇f_k, w_k)))
            
        k += 1
        solve(outer_min_model)
        # Updating all the values
        t_k = getvalue(t)
#         print("\n")
#         print(t_k)
        w_k = getvalue(w)
        
        exponents_f_k = []
    
        for i in 1:n
            if log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)) == Inf
                append!(exponents_f_k, -y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)
            else
                append!(exponents_f_k, log(1+exp(-y[i]*dot(X[i,:], w_k)+λ*transpose(w_k)*w_k)))
            end
        end
            
            
        f_k = sum(exponents_f_k[i] for i=1:n)

        ∇f_k = compute_∇f(w_k, y, X, λ)
         if k%100 == 0
             println("Number of constraints: ", k, "\t Error = ", abs(t_k - f_k))
#             println("f",f_k)
#             println("t",t_k)
#             println("∇f_k",∇f_k)
         end
        if k > 20000
            break
        end
    end
    push!(errors, f_k - t_k)
    return t_k, f_k, w_k, errors
end


rlr (generic function with 1 method)

### Stable Robust Logistic Regression

In [165]:
function classification_metrics(preds, actual)
    accuracy = sum(preds .== actual)/size(preds)[1]
    tpr = dot(
        (preds.==1),actual.==1
        )/(
        dot((preds.==1),actual.==1
            ) + dot(
            (preds.==-1),actual.==1)
    )
    fpr = dot(
        (preds.==1),actual.==-1
        )/ (
        dot((preds.==1),actual.==-1
            ) + dot(
            (preds.==-1),actual.==-1)
    )
    return accuracy, tpr, fpr
end

function compute_derivative(w_k, z, y, X, λ)
    
    n, p = size(X)
    
    temp = zeros(p)
    
#     for i in 1:n
#         t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)
#     #         t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)
#         Δ = z[i]*(1/(1+t))*t*(-y[i]*Array(X[i,:]) .+ 2*λ*w_k)
#         temp = temp + Δ
#     end
    for i in 1:n
        t = exp(y[i]*(transpose(w_k)*Array(X[i,:]))-λ*transpose(w_k)*w_k)
    #         t = exp(-y[i]*(transpose(w_k)*Array(X[i,:]))+λ*transpose(w_k)*w_k)
        Δ = z[i]*(1/(1+t))*(-y[i]*Array(X[i,:]) .+ 2*λ*w_k)
        temp = temp + Δ
    end
    derivative = temp
#     derivative = sum(
#         (
#             z[i]*(2*alpha.*w-y[i].*X[i,:]))/(
#             exp(y[i]*dot(w,X[i,:])-alpha.*dot(w,w))+1
#             ) for i in 1:n
#         )
    
    return derivative
    
end

function inner(w, y, X, k, alpha)
    
    n, p = size(X)
    
    model_inner = Model(solver=GurobiSolver(OutputFlag=0,gurobi_env))
    
    @variable(model_inner, 1 >= z[1:n] >= 0)
    
#     @constraint(model_inner, [i=1:n], 1 >= z[i])
    @constraint(model_inner, sum(z) <= k)
    
    exponents = []
    
    for i in 1:n
        expo = log(1+exp(-y[i]*dot(X[i,:], w)+ alpha*dot(w,w)))
        if expo == Inf
            expo = -y[i]*dot(X[i,:], w)+ alpha*dot(w,w)
        end
        append!(exponents, expo)
    end
    
    @objective(model_inner,
        Max,
        sum(z[i]*exponents[i] for i=1:n)
    )
    
#     print(model_inner)
#     print("\nprint inner\n")
    
#     print(w)
#     print("\n\n\n\n\n\n")
    
    solve(model_inner)
    
    optimal_z = getvalue(z)
    optimal_f = getobjectivevalue(model_inner)
    
    return optimal_z, optimal_f
    
end

function srlr(y, X, epsilon, k, alpha)
    deltas = []
    n, p = size(X)
    initialization_w = [0.0 for i in 1:p]
    initialization_z, initial_f = inner(initialization_w, y, X, k, alpha)
    initial_derivative_f = compute_derivative(initialization_w, initialization_z, y, X, alpha)
    
    model_outer = Model(solver=GurobiSolver(OutputFlag=0, gurobi_env))
    
    @variable(model_outer, t >= 0)
    @variable(model_outer, w[1:p])
    
    @constraint(
        model_outer, t >= initial_f + dot(initial_derivative_f, w)-dot(initial_derivative_f, initialization_w)
    )
#     @constraint(model_outer, [j=1:p], 10.0 >= w[j])
#     @constraint(model_outer, [j=1:p], w[j] >= -10.0)
    
    @objective(model_outer, Min, t)
    
    number_const = 1
    solve(model_outer)

    t_new = getvalue(t)
    w_new = getvalue(w)
    z_new, f_new = inner(w_new, y, X, k, alpha)

    derivative_f_new = compute_derivative(w_new, z_new, y, X, alpha)
    while abs(f_new - t_new) >= epsilon
        
        push!(deltas, f_new - t_new)
        
        @constraint(model_outer,t >= f_new + dot(derivative_f_new, w)-dot(derivative_f_new, w_new))
        
#         print(model_outer)
#         print("print outer\n")
        
        number_const += 1
        solve(model_outer)
        t_new = getvalue(t)
        w_new = getvalue(w)
        z_new, f_new = inner(w_new, y, X, k, alpha)

        derivative_f_new = compute_derivative(w_new, z_new, y, X, alpha)
        
        if number_const%100 == 0
            println("Number of constraints: ", number_const, "\t Step delta = ", abs(t_new - f_new))
        end
        
        if number_const > 100000
            break
            
        end
    end
    push!(deltas, f_new - t_new)
    return t_new, f_new, w_new, z_new, deltas
end


srlr (generic function with 1 method)

#### Validation of SRLR

In [36]:
function classification_metrics(preds, actual)
    accuracy = sum(preds .== actual)/size(preds)[1]
    tpr = dot(
        (preds.==1),actual.==1
        )/(
        dot((preds.==1),actual.==1
            ) + dot(
            (preds.==-1),actual.==1)
    )
    fpr = dot(
        (preds.==1),actual.==-1
        )/ (
        dot((preds.==1),actual.==-1
            ) + dot(
            (preds.==-1),actual.==-1)
    )
    return accuracy, tpr, fpr
end


classification_metrics (generic function with 1 method)

In [37]:
function validation_srlr(X_trainvalid, y_trainvalid, alpha_values, epsilon = 0.001, persent_traindata = 0.8)
    for alpha in alpha_values
        (t_opt, f_opt, w_opt, z_opt, deltas) = srlr(y_trainvalid, X_trainvalid, epsilon, floor(Int, persent_traindata*size(X)[1]), alpha)
        
        train_index = z_opt.>0
        validation_index = z_opt.==0
        
        X_train = X_trainvalid[train_index,:]
        X_val = X_trainvalid[validation_index,:]
        y_train = y_trainvalid[train_index,:]
        y_val = y_trainvalid[validation_index,:]
        
        
    end
end

validation_srlr (generic function with 3 methods)

### Result creation

In [222]:
df = CSV.read("framingham.csv";header=true)
lambda_vals = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1, 0.5, 1.0]
categorical_vars = Symbol.(["education"])
numerical_vars = Symbol.(["age" ;"cigsPerDay";"totChol";"sysBP";"diaBP";"BMI";"heartRate";"glucose"])
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

X = df[:,1:end-1]
y = df[:,end]

3658-element Array{Int64,1}:
 -1
 -1
  1
 -1
 -1
 -1
  1
 -1
  1
  1
  1
 -1
  1
  ⋮
 -1
 -1
  1
 -1
  1
 -1
  1
  1
 -1
 -1
 -1
 -1

In [None]:
(best_alpha_framing,
    auc_trainvalid_framing,
    auc_test_framing,
    pred_prob_test_framing,
    pred_prob_trainvalid_framing) = results_srlr(
    X, y, lambda_vals, 0.1, 1)

In [None]:
df = CSV.read("Data/caesarian.csv";header=true)
lambda_vals = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1]
categorical_vars = Symbol.(["Delivery number" ;"Delivery time";"Blood of Pressure";"Heart Problem"])
numerical_vars = Symbol.(["Age"])
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

X = df[:,1:end-1]
y = df[:,end]

In [None]:
(best_alpha_caesar,
    auc_trainvalid_caesar,
    auc_test_caesar,
    pred_prob_test_caesar,
    pred_prob_trainvalid_caesar) = results_srlr(
    X, y, lambda_vals, 0.1, 1)

In [None]:
df = CSV.read("Data/credit-screening/crx.data";header=false)
n,p=size(df)
categorical_vars = propertynames(df[1,vcat(1,4:7,9:10,12:13,16)])
numerical_vars = propertynames(df[1,vcat(2:3,8,11,14:15)])
lambda_vals = [0.0001,0.0005,0.001,0.005,0.01]
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)

X = df[:,1:end-1]
y = df[:,end]

In [None]:
(best_alpha_crx,
    auc_trainvalid_crx,
    auc_test_crx,
    pred_prob_test_crx,
    pred_prob_trainvalid_crx) = results_srlr(
    X, y, lambda_vals, 0.1, 1)

In [None]:
df = CSV.read("Data/default of credit card clients.csv";header=true)
n,p = size(df)
categorical_vars = propertynames(df[1,vcat(4:5)])
numerical_vars = propertynames(df[1,vcat([2,6],7:24)])
lambda_vals = [0.0001,0.001,0.01]
df = preprocess(df, categorical_vars, numerical_vars)
n,p=size(df)
X = df[:,1:end-1]
y = df[:,end]

In [None]:
(best_alpha_default,
    auc_trainvalid_default,
    auc_test_default,
    pred_prob_test_default,
    pred_prob_trainvalid_default) = results_srlr(
    X, y, lambda_vals, 0.1, 1)

In [210]:
Random.seed!(1)
X_trainvalid, X_test, y_trainvalid, y_test = trainvalid_test_split(X, y, 0.8)

(t_opt, f_opt, w_opt, z_opt, deltas) = srlr(y_trainvalid, X_trainvalid, 1.5, floor(Int, 0.8*size(X_trainvalid)[1]), 0.001)

Number of constraints: 100	 Step delta = 1.3804243676579135e7
Number of constraints: 200	 Step delta = 3.474774686579575e6
Number of constraints: 300	 Step delta = 399133.46997150284
Number of constraints: 400	 Step delta = 82270.56531339893
Number of constraints: 500	 Step delta = 3943.06429606212
Number of constraints: 600	 Step delta = 582.2833417724397
Number of constraints: 700	 Step delta = 59.78607284798511
Number of constraints: 800	 Step delta = 3.5167997023713724


(1027.1278049793152, 1028.5488086456257, [0.71438, 0.688522, 0.0863272, 0.102719, 0.233047, 0.607356, 0.387154, 0.313157, -0.323475, -0.101629, 0.12878, -0.255808, 0.357962, 0.078358, -0.139651, -4.8071, -5.22683], [0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0  …  1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], Any[14008.1, 10170.8, 2.64653e6, 9.08888e5, 8.67502e5, 2.62773e7, 8.83516e6, 2.45704e7, 2.92669e7, 2.25702e8  …  2.57718, 2.0952, 2.19086, 1.71782, 1.95051, 1.97516, 1.90864, 2.17449, 1.63181, 1.421])

In [211]:
train_index = z_opt.>0
validation_index = z_opt.==0

X_train = X_trainvalid[train_index,:]
X_val = X_trainvalid[validation_index,:]
y_train = y_trainvalid[train_index,:]
y_val = y_trainvalid[validation_index,:]

586×1 Array{Int64,2}:
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
  ⋮
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1
 -1

In [212]:
(length(y_val)+sum(y_val))/(2*length(y_val))

0.0

In [213]:
(length(y_train)+sum(y_train))/(2*length(y_train))

0.5252136752136752

In [207]:
Random.seed!(1)
t_final, f_final, w_final, errors = rlr(y_trainvalid, X_trainvalid, 0.0001, 0.001)
pred_prob_test = 1 ./ (1 .+ exp.(-(Matrix(X_test)*w_final).+best_alpha*transpose(w_final)*w_final))
pred_prob_trainvalid = 1 ./ (1 .+ exp.(-(Matrix(X_trainvalid)*w_final).+best_alpha*transpose(w_final)*w_final))
auc_trainvalid = roc_auc_score(y_trainvalid ,pred_prob_trainvalid)
auc_test = roc_auc_score(y_test ,pred_prob_test)

Number of constraints: 100	 Error = 522.6399885124542
Number of constraints: 200	 Error = 55.85423274437517
Number of constraints: 300	 Error = 4.835614495877962
Number of constraints: 400	 Error = 0.6252706307452627
Number of constraints: 500	 Error = 0.03934297927935404
Number of constraints: 600	 Error = 0.0026394255153263657
Number of constraints: 700	 Error = 0.00030354265345522435


0.8978211231084795

In [208]:
auc_trainvalid

0.9005975701148776

In [209]:
auc_test

0.8978211231084795