In [3]:
using Pkg
include("./P2H_CapacityExpansion.jl")
cd("/cluster/home/danare/git")
Pkg.activate(".")
using .P2H_CapacityExpansion
using DataFrames
using Parameters
using Flux
using Surrogates
using ScikitLearn
using LinearAlgebra, Random, Statistics
using JuMP
using XLSX
using PlotlyJS
using Clustering
using CSV
using Dates
using StatsBase, MultivariateStats

[32m[1m  Activating[22m[39m project at `~/git`


# Read the Data

In [4]:
# Path of the scenario result file read by the cells below.
# The second assignment overrides the first, so only the V3 file is actually used.
file = "/cluster/home/danare/git/P2H_CapacityExpansion/results/aggregated_results/500_scenarios.txt"
file = "/cluster/home/danare/git/P2H_CapacityExpansion/results/500_scenarios_V3.txt"

"/cluster/home/danare/git/P2H_CapacityExpansion/results/500_scenarios_V3.txt"

# Split the Data into Training and Test

In [5]:
# Load the scenario table and discard the :ENS column in one step.
df_raw = select(P2H_CapacityExpansion.read_txt_file(file), Not(:ENS))

# Partition into train/test matrices with :Cost, :Generation and :Emission as targets.
# NOTE(review): 0.7 is presumably the training share — confirm against partitionTrainTest.
X_train, y_train, X_test, y_test = P2H_CapacityExpansion.partitionTrainTest(
    df_raw, [:Cost, :Generation, :Emission], 0.7
)

([10.0 16.0 … 23.0 0.0; 6.0 13.0 … 23.0 0.0; … ; 0.0 6.0 … 28.0 0.0; 18.0 424.0 … 83.0 0.0], [1532.0 43025.0 1.12446958e8; 3055.0 46093.0 1.27946124e8; … ; 1963.0 57969.0 1.51583049e8; 1257.0 40193.0 0.0], [32.0 36.0 … 22.0 0.0; 34.0 44.0 … 23.0 0.0; … ; 0.0 9.0 … 38.0 0.0; 12.0 15.0 … 23.0 0.0], [1385.0 44664.0 1.52371092e8; 1305.0 43858.0 1.40664468e8; … ; 1185.0 43358.0 9.1344118e7; 1624.0 43308.0 1.24961636e8])

In [6]:
### standardise features and targets ###
# The test features are scaled with the statistics returned for the training features.
X_train_scaled, μX, σX = P2H_CapacityExpansion.scaling(X_train)
y_train_scaled, μy, σy = P2H_CapacityExpansion.scaling(y_train)
X_test_scaled = (X_test .- μX) ./ σX

# overwrite every NaN entry of the scaled test features with zero
for nan_pos in findall(isnan, X_test_scaled)
    X_test_scaled[nan_pos] = 0.0
end

# Iterate through the Models

In [21]:
# Registry of surrogate models to evaluate: display name => fitting function.
# Each function is later called as `fun(X_train_scaled, y_train_scaled, X_test_scaled)`
# and its result is read through the `.prediction` field.
# Commented-out entries are models that are currently switched off.
models = Dict(
    #"RandomForest" => P2H_CapacityExpansion.random_forest_sklearn,
    #"DecisionTree" => P2H_CapacityExpansion.decision_tree_sklearn,
    "LinearRegression" => P2H_CapacityExpansion.linear_regression_sklearn,
    "NeuralNetwork" => P2H_CapacityExpansion.simple_neural_network_sklearn,
    #"GaussianProcesses" => P2H_CapacityExpansion.gaussian_process,
    #"SVR" => P2H_CapacityExpansion.svr_sklearn,
);

In [8]:
"""
    kmeans_subset(X, n)

Cluster the rows of `X` into `n` k-means clusters and return one row index per cluster:
the first row assigned to each cluster, ordered by cluster number.

`X` is anything `Matrix` can convert (e.g. a `DataFrame`) with one observation per row.
Clusters that end up without any member are skipped, so the result can contain fewer
than `n` indices but never a `nothing` entry, and is always usable for row indexing.
"""
function kmeans_subset(X, n)
    # transposed so that every observation (row of X) becomes a column for `kmeans`
    R = kmeans(Matrix(X)', n)

    # single pass over the assignments; 0 marks "no row seen for this cluster yet"
    first_row = zeros(Int, n)
    for (row, cluster) in enumerate(R.assignments)
        if first_row[cluster] == 0
            first_row[cluster] = row
        end
    end
    return filter(!iszero, first_row)
end

kmeans_subset (generic function with 1 method)

In [9]:
# Names (keys of `models`) whose score is averaged over `n_runs` repetitions in the
# learning-curve loop; every other model is fitted once.
stochastic_models = Set(["RandomForest", "NeuralNetwork", "DecisionTree"]);

In [22]:
# Learning-curve experiment: for a growing number of samples `n`, select `n` rows of
# `df_raw` through k-means, fit every model in `models`, and append the (averaged)
# test R² to `df_full`.
step_size = 100
df_full = DataFrame(Method=String[],  Value=Float64[], Size=Int[]);
n_runs = 3   # repetitions averaged for the models listed in `stochastic_models`

for n ∈ 100:step_size:nrow(df_raw)

    # cluster all columns except :Cost and keep one row per cluster
    # NOTE(review): :Emission and :Generation are used as targets below but still take
    # part in the clustering — confirm this is intended.
    idx = kmeans_subset(select(df_raw, Not(:Cost)), n)
    df = df_raw[idx, :]

    ### split the data into test and training ###
    X_train, y_train, X_test, y_test = P2H_CapacityExpansion.partitionTrainTest(df, [:Cost, :Emission, :Generation], 0.8)

    ### scale the data (test features reuse the training statistics) ###
    X_train_scaled, μX, σX  = P2H_CapacityExpansion.scaling(X_train)
    X_test_scaled = (X_test .- μX) ./ σX
    y_train_scaled, μy, σy  = P2H_CapacityExpansion.scaling(y_train)

    # overwrite NaN entries of the scaled test features with zero
    X_test_scaled[isnan.(X_test_scaled)] .= 0.0

    ### train ML model and compute R2 ###
    for (name, fun) ∈ models
        println(name)
        # determine average for non-deterministic models
        iter = name ∈ stochastic_models ? n_runs : 1

        # floating-point accumulator so the variable keeps one type across iterations
        r2 = 0.0
        for k ∈ 1:iter
            sg = fun(X_train_scaled, y_train_scaled, X_test_scaled)
            # undo the target scaling before scoring against the unscaled test targets
            ŷ_rescaled = sg.prediction .* σy .+ μy
            r2 += P2H_CapacityExpansion.r2_score(y_test, ŷ_rescaled)
        end

        ### add to the df ###
        push!(df_full, (Method = name, Value = r2 / iter, Size = size(X_train, 1)))
    end
end

NeuralNetwork

Iteration 68, loss = 0.04678217
Iteration 69, loss = 0.04612555
Iteration 70, loss = 0.04548684
Iteration 71, loss = 0.04485768
Iteration 72, loss = 0.04423961
Iteration 73, loss = 0.04363117
Iteration 74, loss = 0.04303329
Iteration 75, loss = 0.04245425
Iteration 76, loss = 0.04188239
Iteration 77, loss = 0.04132197
Iteration 78, loss = 0.04076942
Iteration 79, loss = 0.04022826
Iteration 80, loss = 0.03970065
Iteration 81, loss = 0.03918445
Iteration 82, loss = 0.03867562
Iteration 83, loss = 0.03817742
Iteration 84, loss = 0.03769026
Iteration 85, loss = 0.03720477
Iteration 86, loss = 0.03672549
Iteration 87, loss = 0.03625366
Iteration 88, loss = 0.03578939
Iteration 89, loss = 0.03533138
Iteration 90, loss = 0.03487438
Iteration 91, loss = 0.03442659
Iteration 92, loss = 0.03398742
Iteration 93, loss = 0.03355855
Iteration 94, loss = 0.03313404
Iteration 95, loss = 0.03271315
Iteration 96, loss = 0.03229692
Iteration 97, loss = 0.03188413
Iteration 98, loss = 0.03

In [None]:
# Collect one line+marker trace per method and draw them in a single figure.
traces = AbstractTrace[]  # element type required for a vector of individual traces
sort!(df_full, [:Method, :Size], rev=true)

foreach(unique(df_full.Method)) do method
    rows = filter(:Method => ==(method), df_full)
    push!(
        traces,
        scatter(
            x = rows.Size,
            y = rows.Value,
            mode = "lines+markers",
            name = string(method),
        ),
    )
end

# Figure title and axis labels
layout = Layout(
    title = "Value vs Size by Method",
    xaxis_title = "Training Data",
    yaxis_title = "Accuracy",
)

# Combine traces and layout into one plot object
plt = plot(traces, layout)

# Render the figure
display(plt)

## Sub-sampling strategies

In [None]:
# Full paths (join=true) of all `.txt` files directly inside `dir`.
# NOTE(review): `dir` is not defined in the visible cells — it must be set before this runs.
files_list = filter(f -> endswith(f, ".txt"), readdir(dir, join=true));

In [None]:
# Three alternative ways of building a sub-sample `df`. Each loop only binds `df`;
# no model is fitted and nothing is appended to `df_full` in this cell.
step_size = 50
df_full = DataFrame(Method=String[],  Value=Float64[], Size=Int[]);


### ALTERNATIVE 1: RANDOM SAMPLING ###
# draw n distinct rows of df_raw (sampling without replacement)
for n in 140:step_size:size(df_raw)[1] #unique(ceil(Int, size(df_raw)[1]/n) for n in 140:step_size:size(df_raw)[1])
    df = df_raw[StatsBase.sample(1:nrow(df_raw), n; replace=false), :]
end

### ALTERNATIVE 2: EQUAL SELECTION SAMPLING ###
# here n is a stride: ceil(number of rows / target size), de-duplicated with unique
for n in unique(ceil(Int, size(df_raw)[1]/n) for n in 140:step_size:size(df_raw)[1])
    # keep every n-th row, starting with the first
    df = df_raw[1:n:end, :]
    # drop the columns whose sum over the selected rows is zero
    df = select(df, names(df)[[sum(df[!, col]) != 0 for col in names(df)]])
end

### ALTERNATIVE 3: LHS ###
# read every file in files_list in turn
# NOTE(review): presumably each file holds a Latin-hypercube sample of a different size — confirm.
for f in files_list
    df = P2H_CapacityExpansion.read_txt_file(f)
end



# Read, normalize and transpose the df

In [None]:
# Re-read the scenario table. `df_raw` keeps every column, including :Cost, because
# `df_raw.Cost` is attached to the PCA scores in a later cell; only the PCA input
# excludes :Cost.
df_raw = P2H_CapacityExpansion.read_txt_file(file);
df_features = select(df_raw, Not(:Cost))
m = Matrix(df_features)
x_norm, μ, σ = P2H_CapacityExpansion.scaling(m)
# transposed: the rows of the table become the columns of X
X = x_norm'

# Derive PCs

In [None]:
# Fit a PCA model on X, keeping at most 3 output dimensions (maxoutdim=3)
model = fit(PCA, X; maxoutdim=3)

In [None]:
# Apply the fitted PCA model to X (no transpose happens here; that is done when
# `df_pca` is built from `X_transform`)
X_transform = MultivariateStats.transform(model, X)

In [None]:
# Swap the dimensions of the transformed matrix and wrap it in a DataFrame with
# auto-generated column names, then add the cost taken from df_raw as column :Cost
df_pca =   DataFrame(permutedims(X_transform), :auto)
df_pca.Cost = df_raw.Cost

In [None]:
# References on reducing high neural-network loss values:
# https://www.reddit.com/r/deeplearning/comments/14vnfe8/how_to_decrease_high_loss_values/
# https://discourse.julialang.org/t/how-to-efficiently-and-precisely-fit-a-function-with-neural-networks/73726
# https://stackoverflow.com/questions/59153248/why-is-my-neural-network-stuck-at-high-loss-value-after-the-first-epochs


In [None]:
# Baseline run: fit every model on the table read from `file`, with :Cost as the target.
df_full = DataFrame(Method=String[], Value=Float64[], Size=Int[]);
df = P2H_CapacityExpansion.read_txt_file(file);

### train / test partition ###
X_train, y_train, X_test, y_test = P2H_CapacityExpansion.partitionTrainTest(df, :Cost, 0.7)

### standardise; the test features reuse the training statistics ###
X_train_scaled, μX, σX = P2H_CapacityExpansion.scaling(X_train)
y_train_scaled, μy, σy = P2H_CapacityExpansion.scaling(y_train)
X_test_scaled = (X_test .- μX) ./ σX

# overwrite NaN entries of the scaled test features with zero
X_test_scaled[isnan.(X_test_scaled)] .= 0.0

### fit each model and record its R² on the test set ###
for (name, fun) ∈ models
    fitted = fun(X_train_scaled, y_train_scaled, X_test_scaled)
    # undo the target scaling before scoring
    prediction = fitted.prediction .* σy .+ μy
    score = P2H_CapacityExpansion.r2_score(y_test, prediction)

    push!(df_full, (Method = name, Value = score, Size = size(X_train, 1)))
end


In [None]:
df_full

In [None]:
# Same evaluation as the baseline run, but on the PCA table `df_pca`.
df_full = DataFrame(Method=String[], Value=Float64[], Size=Int[]);
df = df_pca

### train / test partition with :Cost as the target ###
X_train, y_train, X_test, y_test = P2H_CapacityExpansion.partitionTrainTest(df, :Cost, 0.7)

### standardise; the test features reuse the training statistics ###
X_train_scaled, μX, σX = P2H_CapacityExpansion.scaling(X_train)
y_train_scaled, μy, σy = P2H_CapacityExpansion.scaling(y_train)
X_test_scaled = (X_test .- μX) ./ σX

# overwrite NaN entries of the scaled test features with zero
for nan_pos in findall(isnan, X_test_scaled)
    X_test_scaled[nan_pos] = 0.0
end

### fit each model and record its R² on the test set ###
for (name, fun) ∈ models
    fitted = fun(X_train_scaled, y_train_scaled, X_test_scaled)
    # undo the target scaling before scoring
    prediction = fitted.prediction .* σy .+ μy
    score = P2H_CapacityExpansion.r2_score(y_test, prediction)

    push!(df_full, (Method = name, Value = score, Size = size(X_train, 1)))
end

new sampling technique
iterate more often through the neural network

In [None]:
df_full

In [None]:
# Step 3: Loop through files with index
# Builds a table with one row per (scenario, year) and fills in the cost and the
# per-technology capacity parsed from each result file.
# `config` and `txt_files` must already be defined when this cell runs.

# Parameters: technologies flagged "inv" == true, minus those in the "transmission" group
techs = setdiff(
    [key for (key, val) ∈ config["techs"] if get(val, "inv", "") == true],
    [key for (key, val) ∈ config["techs"] if get(val, "tech_group", "") == "transmission"],
)
years = config["year"]
scenarios = 1:length(txt_files)

# Column names
columns = [:scenario, :year, :cost] ∪ Symbol.(techs)

# scenario varies slowest (inner repeat), year cycles within each scenario (outer repeat)
df = DataFrame(
    scenario = repeat(scenarios, inner=length(years)),
    year = repeat(years, outer=length(txt_files)),
    cost = fill(0.0, length(txt_files) * length(years)),
)

# Add technology columns, initialized to 0.0
for tech in techs
    df[!, Symbol(tech)] = zeros(nrow(df))
end

## fill in the values
for file in txt_files
    # scenario number = integer before the first "_" of the file name
    i = parse(Int64, first(split(basename(file), "_")))

    # stream the file line by line instead of loading it completely
    for line in eachline(file)
        if occursin("TotalCapacityAnnual", line)
            # technology name = second comma-separated field of the line
            g = split(line, ",")[2]
            if g in techs
                val = parse(Float64, strip(split(line, "=")[2]))
                # year = last comma-separated item before the first "]"
                y = parse(Int64, split(split(line, "]")[1], ",")[end])

                # insert into the dataframe
                idx = findfirst((df.year .== y) .& (df.scenario .== i))
                isnothing(idx) && error("no row for scenario $i and year $y (file $file)")
                df[idx, Symbol(g)] = val
            end

        elseif occursin("COSTvar", line)
            # NOTE(review): the year is read from fixed positions 8:12 of the line —
            # confirm this matches the exact layout of the COSTvar lines.
            y = parse(Int64, line[8:12])
            val = parse(Float64, strip(split(line, "=")[2]))

            # insert into the dataframe
            idx = findfirst((df.year .== y) .& (df.scenario .== i))
            isnothing(idx) && error("no row for scenario $i and year $y (file $file)")
            df[idx, :cost] = val
        end
    end
end


In [None]:
# Copy of `df` without the :year, :cost and :scenario columns
df_agg = select(df, Not([:year, :cost, :scenario]))
# Transposed view of the remaining columns: one column of X per row of `df_agg`
X = transpose(Matrix(df_agg))

In [None]:
# Upper (`ub`) and lower (`lb`) bounds of the 7 sampled parameters; position i of `ub`
# pairs with position i of `lb`.
# NOTE(review): the meaning and units of the individual parameters are not documented
# here — confirm against the scenario definition.
ub = [1.25, 620, 460, 300, 0.06, 0.93, 20]
lb = [0.75, 420, 260, 100, 0.015, 0.56, 13]


# Number of samples
n = 3

# Latin Hypercube Sampling: draw `n` points between `lb` and `ub`
scenarios = Surrogates.sample(n,lb,ub, Surrogates.LatinHypercubeSample())

# 2. TRAIN THE MODEL #

In [None]:
"""
    perclass_splits(y, percent; rng=Random.default_rng())

Return indices into `y` chosen by random sub-sampling within each class.

For every distinct value of `y` (in order of first appearance) the positions holding
that value are collected and each one is kept independently with probability `percent`
(`randsubseq`). The kept positions of all classes are concatenated.

# Arguments
- `y`: array of class labels.
- `percent`: inclusion probability for each position, between 0 and 1.

# Keywords
- `rng`: random number generator to draw from; defaults to the default RNG, so results
  after `Random.seed!` are the same as without the keyword.

# Returns
A vector of indices of `y` (`Vector{Int}` when `y` is a vector).
"""
function perclass_splits(y, percent; rng=Random.default_rng())
    # typed accumulator: the same index type that `findall` produces for `y`
    keep_index = keytype(y)[]
    for class in unique(y)
        class_index = findall(==(class), y)
        # append! avoids splatting a potentially long index vector into varargs
        append!(keep_index, randsubseq(rng, class_index, percent))
    end
    return keep_index
end

In [None]:
# Target vector: the :cost column of `df`
y = df[!, :cost]

# reproducible partition of the positions of `y` into train and test
Random.seed!(1)
train_index = perclass_splits(y, 0.67)
test_index = setdiff(1:length(y), train_index)

# split features (columns of X are selected by index)
X_train, X_test = X[:, train_index], X[:, test_index]

# split targets, each converted to Float64 and transposed into a row
y_train = transpose(Float64.(y[train_index]))
y_test = transpose(Float64.(y[test_index]))

In [None]:
# Two dense layers: 12 inputs -> 32 hidden units with relu -> 1 output.
# NOTE(review): the input width 12 is hard-coded; presumably it has to equal the number
# of rows of the feature matrix fed to the model (`X_train`) — confirm.
model = Chain(
    Dense(12, 32, relu),
    Dense(32, 1)  # output: a single float
)

# Mean squared error between the model output for `x` and the target `y`
loss(x, y) = Flux.Losses.mse(model(x), y)  # or Flux.Losses.mae

In [None]:
# track parameters: collect the trainable parameters of `model`
ps = Flux.params(model)
# select an optimizer: ADAM with the learning rate given below
learning_rate = 0.01
opt = ADAM(learning_rate)


In [None]:
# train the model
# Runs `epochs` passes over the single (X_train, y_train) batch and records the
# training loss after each pass.
loss_history = Float64[]

epochs = 500

for epoch in 1:epochs
    # one optimisation pass; module-qualified so the call resolves whether or not the
    # installed Flux version exports `train!`
    Flux.train!(loss, ps, [(X_train, y_train)], opt)
    # print report
    train_loss = loss(X_train, y_train)
    push!(loss_history, train_loss)
    println("Epoch = $epoch : Training loss = $train_loss")
end

In [None]:
# Reshape the long table `df_long` into a technology × scenario-year matrix.
# NOTE(review): `df_long` is not defined in the visible cells; it is expected to have
# the columns :scenario, :year, :technology and :value used below.

# Step 1: Create a combined scenario-year column ("<scenario>_<year>")
df_long[!, :scenario_year] = string.(df_long.scenario, "_", df_long.year)

# Step 2: Pivot wide: rows = technology, columns = scenario_year, values = value
df_wide = unstack(df_long, :technology, :scenario_year, :value)

# Show the result as a matrix (without the :technology column)
X = Matrix(df_wide[:, Not(:technology)])
# replace missing entries with 0.0
X = coalesce.(X, 0.0)
# Reference: https://medium.com/@mandarangchekar7/a-neural-network-explained-and-implemented-in-julia-1fbfe4aaf0df


In [None]:
# make predictions
y_hat_raw = model(X_test)

In [None]:
# Decode the raw model output with `onecold`, shift the result by -1, and report the
# share of entries equal to `y`.
# NOTE(review): this is a classification-style accuracy, while `model` ends in a
# single-output layer trained with MSE — confirm the metric is what is wanted here.
# NOTE(review): `y_test_raw` is not defined in the visible cells (only `y_test` is).
y_hat = onecold(y_hat_raw) .- 1
y = y_test_raw
mean(y_hat .== y)

In [None]:
y_test