In [130]:
using
CSV,
DataFrames,
DecisionTree,
Plots,
Random,
Statistics,
StatsPlots

In [199]:
"""
    threshold_costs(actual, preds)

Score every decision threshold from 0 to 1 (step 0.01) with `cost_function`.

For each threshold, every value in `preds` is turned into a label (`1` when it is
strictly greater than the threshold, `0` otherwise) and the labels are costed
against `actual`.

Returns the tuple `(lowest_cost, cost_plot, best_threshold)`, where `cost_plot` is
a line plot of the cost at each threshold and `best_threshold` is the largest
threshold that reaches `lowest_cost`.
"""
function threshold_costs(actual, preds)
    thresholds = range(0, 1, step=0.01)
    costs = [cost_function(actual, [p > t ? 1 : 0 for p in preds]) for t in thresholds]
    lowest = minimum(costs)
    # Several thresholds can share the minimum; keep the last (largest) one.
    best_threshold = thresholds[findlast(c -> c == lowest, costs)]
    p1 = plot(costs, ylim=(0, maximum(costs)), label="")
    return lowest, p1, best_threshold
end
# Try the function on a small hand-written example (11 labels, 11 predictions).
threshold_costs([1,1,1,1,0,1,1,0,0,0,0], [1,1,1,1,1,1,0,0,0,0,1])

(520, Plot{Plots.GRBackend() n=1}, 0.99)

In [None]:
# data processing functions
"""
    fill_missing_mean(df::DataFrame)

Return a data frame in which every `missing` entry of a numeric column is replaced
by the mean of that column's non-missing values.

Imputed columns are converted to the type of their mean. Non-numeric columns and
columns whose element type is just `Missing` are left as they are. `df` itself is
not modified; columns that are not imputed are shared with the result.
"""
function fill_missing_mean(df::DataFrame)
    # Shallow copy: replacing a column below rebinds it in `X` only, so the
    # caller's data frame keeps its original columns.
    X = copy(df, copycols=false)
    for col in names(X)
        column = X[!, Symbol(col)]
        # Decide from the column's element type instead of probing the first
        # fully-populated row, which fails when no such row exists.
        T = nonmissingtype(eltype(column))
        (T !== Union{} && T <: Number) || continue
        col_mean = mean(skipmissing(column))
        X[!, Symbol(col)] = typeof(col_mean)[coalesce(v, col_mean) for v in column]
    end
    return X
end

"""
    class_to_bool(df::DataFrame)

Return a data frame whose `:class` column is recoded to integers: `0` where the
original value equals `"neg"` and `1` for every other value.

`df` itself is not modified; all other columns are shared with the result.
"""
function class_to_bool(df::DataFrame)
    # Shallow copy so that rebinding `:class` does not touch the caller's frame.
    X = copy(df, copycols=false)
    to_label(x) = x == "neg" ? 0 : 1
    X[!, :class] = to_label.(X.class)
    return X
end

"""
    min_max_scale(df::DataFrame)

Return a data frame in which every numeric column is rescaled with
`(x - min) / (max - min)`, so its values span the interval from 0 to 1.

A constant column (where `max == min`) is mapped to zeros instead of `NaN`.
`missing` entries are ignored when computing the minimum and maximum and stay
`missing` in the output. Non-numeric columns and columns without any non-missing
value are left as they are. `df` itself is not modified; columns that are not
rescaled are shared with the result.
"""
function min_max_scale(df::DataFrame)
    # Shallow copy: replacing a column below rebinds it in `X` only.
    X = copy(df, copycols=false)
    for col in names(X)
        column = X[!, Symbol(col)]
        # Decide from the column's element type instead of probing the first
        # fully-populated row, which fails when no such row exists.
        T = nonmissingtype(eltype(column))
        (T !== Union{} && T <: Number) || continue
        present = skipmissing(column)
        isempty(present) && continue
        lo, hi = extrema(present)
        span = hi - lo
        # A zero span would turn the whole column into NaN; divide by one instead.
        scale = iszero(span) ? one(span) : span
        X[!, Symbol(col)] = (column .- lo) ./ scale
    end
    return X
end

"""
    process_dataset(X::DataFrame)

Run the preprocessing chain on `X`: `fill_missing_mean`, then `class_to_bool`,
then `min_max_scale`, and return the final result.
"""
function process_dataset(X::DataFrame)
    imputed = fill_missing_mean(X)
    labelled = class_to_bool(imputed)
    return min_max_scale(labelled)
end

# ML functions
"""
    train_test_split(X::DataFrame; target_col::String, seed=0, train_share=0.8)

Shuffle the rows of `X` and split them into a training and a test part.

The first `floor(train_share * nrow(X))` shuffled rows form the training part and
the remaining rows the test part. When `seed` is not `0` the global random number
generator is seeded with it before shuffling; with `seed == 0` it is left as is.

Returns `(X_train, X_test, y_train, y_test)`: the feature frames exclude
`target_col`, and the `y` values are the `target_col` columns of each part.
"""
function train_test_split(X::DataFrame; target_col::String, seed=0, train_share=0.8)
    seed == 0 || Random.seed!(seed)

    total = nrow(X)
    order = shuffle(1:total)
    cut = floor(Int, train_share * total)
    train = X[order[1:cut], :]
    test = X[order[(cut + 1):total], :]

    target = Symbol(target_col)
    features(part) = select(part, Not(target))
    return features(train), features(test), train[!, target], test[!, target]
end

"""
    cost_function(actuals, predictions; false_negative_cost=500, false_positive_cost=10)

Return the total misclassification cost of `predictions` against `actuals`.

The two inputs are compared element-wise (via broadcasting). A pair with
`actual == 1` and `prediction == 0` adds `false_negative_cost`, a pair with
`actual == 0` and `prediction == 1` adds `false_positive_cost`, and every other
pair adds nothing.
"""
function cost_function(actuals, predictions; false_negative_cost=500, false_positive_cost=10)
    # Promote once so every branch of `cost` returns the same type.
    fn_cost, fp_cost = promote(false_negative_cost, false_positive_cost)
    no_cost = zero(fn_cost)
    function cost(actual, prediction)
        if actual == 1 && prediction == 0
            return fn_cost
        elseif actual == 0 && prediction == 1
            return fp_cost
        else
            return no_cost
        end
    end
    return sum(cost.(actuals, predictions))
end

"""
    threshold_costs(actual, preds)

Score every decision threshold from 0 to 1 (step 0.01) with `cost_function`.

For each threshold, every value in `preds` is turned into a label (`1` when it is
strictly greater than the threshold, `0` otherwise) and the labels are costed
against `actual`.

Returns the tuple `(lowest_cost, cost_plot, best_threshold)`, where `cost_plot` is
a line plot of the cost at each threshold and `best_threshold` is the largest
threshold that reaches `lowest_cost`.
"""
function threshold_costs(actual, preds)
    thresholds = range(0, 1, step=0.01)
    costs = [cost_function(actual, [p > t ? 1 : 0 for p in preds]) for t in thresholds]
    lowest = minimum(costs)
    # Several thresholds can share the minimum; keep the last (largest) one.
    best_threshold = thresholds[findlast(c -> c == lowest, costs)]
    p1 = plot(costs, ylim=(0, maximum(costs)), label="")
    return lowest, p1, best_threshold
end
    
"""
    cross_fold(train::DataFrame, test::DataFrame, model, n=10)

Estimate the cost of `model` on `n` random train/validation splits of `train`,
then refit it on all of `train` and score it on `test`.

Each round draws a fresh split with `train_test_split` (default share), fits
`model`, and records the lowest cost and the matching threshold reported by
`threshold_costs`. The test predictions are thresholded at the mean of the
recorded thresholds and scored with `cost_function`. Both data frames must have a
`"class"` column, which is used as the target.

Returns `(mean_validation_cost, test_cost)`. `model` is refitted in place.
"""
function cross_fold(train::DataFrame, test::DataFrame, model, n=10)
    validation_perf = Float64[]
    best_thresholds = Float64[]
    for i in 1:n
        println("Fold $i/$n")
        X_train, X_test, y_train, y_test = train_test_split(train, target_col="class")
        DecisionTree.fit!(model, Matrix(X_train), convert(Array, y_train))
        # Predict with the model that was passed in, not with a global variable.
        preds = DecisionTree.predict(model, Matrix(X_test))
        lowest, _, best_threshold = threshold_costs(y_test, preds)
        push!(validation_perf, lowest)
        push!(best_thresholds, best_threshold)
    end

    # training on all data (train_share=1.0 puts every row in the training part)
    X_all, _, y_all, _ = train_test_split(train, target_col="class", train_share=1.0)
    DecisionTree.fit!(model, Matrix(X_all), convert(Array, y_all))

    # test set (train_share=0 puts every row in the test part)
    _, X_holdout, _, y_holdout = train_test_split(test, target_col="class", train_share=0)
    preds = DecisionTree.predict(model, Matrix(X_holdout))
    threshold = mean(best_thresholds)
    test_perf = cost_function(y_holdout, [p > threshold ? 1 : 0 for p in preds])

    return mean(validation_perf), test_perf
end

In [None]:
# Load the training and test sets ("na" is read as missing) and preprocess them.
training = CSV.File("data/aps_failure_training_set.csv"; missingstring="na") |> DataFrame! |> process_dataset
test = CSV.File("data/aps_failure_test_set.csv"; missingstring="na") |> DataFrame! |> process_dataset
# historical performance of algorithms
# (the reader type is `CSV.File`; CSV.jl has no lowercase `CSV.file`)
perf = CSV.File("data/performance.csv") |> DataFrame!

In [None]:
# decision tree
# Grid search: every combination of the four hyper-parameter lists below is
# scored with `cross_fold` and appended to the performance log.
performance = []
max_depth = [10, 50, 100, -1]
min_samples_split = [2, 5, 20, 50]
min_samples_leaf = [1, 5, 20, 50]
min_purity_increase = [0.0, 0.001, 0.01]
# One multi-range loop; the rightmost range varies fastest, as in nested loops.
for md in max_depth, mss in min_samples_split, msl in min_samples_leaf, mpi in min_purity_increase
    params = "decision tree, max_depth:$md, min_samples_split:$mss, min_samples_leaf:$msl, min_purity_increase:$mpi"
    println(params)
    dt = DecisionTreeRegressor(
        max_depth = md,
        min_samples_split = mss,
        min_samples_leaf = msl,
        min_purity_increase = mpi
    )
    validation_results, test_results = cross_fold(training, test, dt, 1)
    push!(performance, (params, validation_results, test_results, "all features"))
end
# Append this run to the historical log and write it back to disk.
tmp_perf = DataFrame(performance)
names!(tmp_perf, [:desc, :val_score, :test_score, :notes])
perf = [perf; tmp_perf]
CSV.write("data/performance.csv", perf)
# Lowest validation cost and every parameter description that achieved it.
best_score = minimum(entry[2] for entry in performance)
best_params = [entry[1] for entry in performance if entry[2] == best_score]

In [None]:
# random forest
# Grid search over random-forest settings. Each combination is fitted on an
# 80/20 split of `training`, its best threshold is picked on the validation
# part, and that threshold is then applied to the whole `test` set.
performance = []
n_subfeatures = round(0.5 * length(names(training)); digits=0)  # alternative: -1
n_trees = [100]             # alternative grid: [20, 50, 100]
partial_sampling = [0.7]    # alternative grid: [0.7, 1]
max_depth = [-1]            # alternative grid: [-1, 5]
min_samples_leaf = [5]      # alternative grid: [5, 10]
min_samples_split = [10]    # alternative grid: [2, 10]
min_purity_increase = 0.0

# One multi-range loop; the rightmost range varies fastest, as in nested loops.
for nt in n_trees, ps in partial_sampling, md in max_depth, msl in min_samples_leaf, mss in min_samples_split
    params = "random forest, n_subfeatures:$n_subfeatures, n_trees:$nt, partial_sampling:$ps, max_depth:$md, min_samples_leaf:$msl, min_samples_split:$mss, min_purity_increase:$min_purity_increase"
    println(params)

    # Fit on the training part and pick the cheapest threshold on the validation part.
    X_train, X_test, y_train, y_test = train_test_split(training, target_col="class", train_share=0.8)
    rf = build_forest(
        y_train, convert(Matrix, X_train),
        n_subfeatures, nt, ps, md, msl, mss, min_purity_increase
    )
    val_preds = apply_forest(rf, convert(Matrix, X_test))
    validation_results, _, best_threshold = threshold_costs(y_test, val_preds)

    # train_share=0 puts every row of `test` into the test part of the split.
    _, test_X_test, _, test_y_test = train_test_split(test, target_col="class", train_share=0)
    test_preds = apply_forest(rf, convert(Matrix, test_X_test))
    test_results = cost_function(test_y_test, [p > best_threshold ? 1 : 0 for p in test_preds])

    params *= ", best_threshold:$best_threshold"
    push!(performance, (params, validation_results, test_results, "all features"))
end
# Append this run to the historical log and write it back to disk.
tmp_perf = DataFrame(performance)
names!(tmp_perf, [:desc, :val_score, :test_score, :notes])
perf = [perf; tmp_perf]
CSV.write("data/performance.csv", perf)
# Lowest validation cost and every parameter description that achieved it.
best_score = minimum(entry[2] for entry in performance)
best_params = [entry[1] for entry in performance if entry[2] == best_score]

In [228]:
# Display the last six rows of the accumulated performance table.
showall(perf[end-5:end, :])

6×4 DataFrame
│ Row │ │     │ ├─────┼
│ 1   │ │ 2   │ │ 3   │ │ 4   │ │ 5   │ │ 6   │ 

│ Row │ desc                                                                                                                                                                     │
│     │ [90mUnion{Missing, String}[39m                                                                                                                                                   │
├─────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ 1   │ decision tree, max_depth:-1, min_samples_split:50, min_samples_leaf:20, min_purity_increase:0.001                                                                        │
│ 2   │ decision tree, max_depth:-1, min_samples_split:50, min_samples_leaf:20, min_purity_increase:0.01                                                                         │
│ 3   │