In [110]:
using Statistics
using Distributions
using Random
using Combinatorics
import LinearAlgebra: dot
using CSV
using DataFrames

In [171]:
"""
    generate_random_digraph(V::Int64, p::Float64)

Generate a random directed graph with `V` vertices, returned as a vector of
`V` Boolean vectors, each of length `V`.

Every entry is an independent Bernoulli(`p`) draw, so each of the `V^2`
possible entries (diagonal included) is `true` with probability `p`.
"""
function generate_random_digraph(V::Int64, p::Float64)
    # One Boolean vector per vertex, each drawn in a single batch of V samples.
    return map(1:V) do _
        convert(Vector{Bool}, rand(Distributions.Bernoulli(p), V))
    end
end


"""
    digraph_prior(lambda::Float64, reference::Vector{Vector{Bool}})

Sample a new graph G from the density

P(G) \\propto exp(-lambda * |G \\ G'|)

where G' is the `reference` graph.

Each entry is drawn independently: an entry that is `true` in `reference` is
included with probability 0.5, and every other entry is included with
probability `exp(-lambda) / (1 + exp(-lambda))`.

Returns a `Vector{Vector{Bool}}` with the same shape as `reference`.
"""
function digraph_prior(lambda::Float64, reference::Vector{Vector{Bool}})

    # Inclusion probability for an entry that is NOT an edge of the reference.
    prob = exp(-lambda) / (1.0 + exp(-lambda))

    # Concretely typed: an untyped `[]` is a Vector{Any}, which cannot be
    # passed to functions that require a Vector{Vector{Bool}}.
    result = Vector{Vector{Bool}}()
    for ps in reference
        new_ps = Vector{Bool}()
        for parent in ps
            bp = parent ? 0.5 : prob
            push!(new_ps, rand(Distributions.Bernoulli(bp)))
        end
        push!(result, new_ps)
    end
    return result
end


"""
    modify_digraph!(ps::Vector{Vector{Bool}}, keep::Int64, add::Int64)

Randomly perturb the graph `ps` in place and return it.

Of the entries that are `true` on entry, a uniformly random subset is cleared
so that at most `keep` of them remain. Then `add` entries, chosen uniformly at
random among those that were `false` on entry, are set to `true`.

Both counts are clamped to what the graph allows: if `keep` is not smaller
than the number of edges nothing is removed, a negative `keep` removes every
edge, and `add` is limited to the number of available non-edges.
"""
function modify_digraph!(ps::Vector{Vector{Bool}}, keep::Int64, add::Int64)

    # Positions (i, j) of the original edges and of the original non-edges.
    # Both are recorded before any mutation, so a removed edge is never re-added.
    ps_idx = Tuple{Int,Int}[]
    not_ps_idx = Tuple{Int,Int}[]
    for (i, row) in enumerate(ps)
        for (j, b) in enumerate(row)
            push!(b ? ps_idx : not_ps_idx, (i, j))
        end
    end

    # Randomly remove edges from the graph
    n_edges = length(ps_idx)
    n_remove = clamp(n_edges - keep, 0, n_edges)
    for rm_idx in Random.randperm(n_edges)[1:n_remove]
        (i, j) = ps_idx[rm_idx]
        ps[i][j] = false
    end

    # Randomly add edges (from the complement)
    n_not_edges = length(not_ps_idx)
    n_add = clamp(add, 0, n_not_edges)
    for add_idx in Random.randperm(n_not_edges)[1:n_add]
        (i, j) = not_ps_idx[add_idx]
        ps[i][j] = true
    end

    return ps
end


"""
    n_coeffs(V::Int64, max_degree::Int64)

Return the number of coefficients of a regression model over `V` variables
with polynomial degree at most `max_degree`, constant term included.

The model has one coefficient per subset of the variables of size
`0, 1, ..., min(V, max_degree)`, so the count is the sum of the
corresponding binomial coefficients.
"""
function n_coeffs(V::Int64, max_degree::Int64)
    total = 0
    for subset_size in 0:min(V, max_degree)
        total += binomial(V, subset_size)
    end
    return total
end
    


"""
    generate_reg_coeffs(ps::Vector{Vector{Bool}}, coeff_std::Float64, regression_deg::Int64)

Draw one regression coefficient vector per parent set in `ps`.

The vector for a parent set has `n_coeffs(k, regression_deg)` entries, where
`k` is the number of `true` entries in that parent set (so the constant term
is included). Entries are independent normal draws scaled by `coeff_std`.
"""
function generate_reg_coeffs(ps::Vector{Vector{Bool}}, coeff_std::Float64, regression_deg::Int64)
    return Vector{Float64}[
        coeff_std .* randn(n_coeffs(sum(parents), regression_deg)) for parents in ps
    ]
end


"""
    initialize_time_series(V::Int64; init_std::Float64=1.0)

Return a length-`V` vector of independent standard-normal draws scaled by
`init_std`, used as the first state of a simulated time series.
"""
function initialize_time_series(V::Int64; init_std::Float64=1.0)
    state = randn(V)
    state .*= init_std
    return state
end


"""
    compute_B_data(x_vec::Vector{Float64}, ps::Vector{Bool}, regression_deg::Int64)

Build the regression feature vector for one variable from the state `x_vec`.

Only the entries of `x_vec` selected by the Boolean mask `ps` are used. The
first entry of the result is the constant term 1; it is followed by the
product of every combination of `deg` selected values, for
`deg = 1, 2, ..., regression_deg` in that order. The result has
`n_coeffs(count(ps), regression_deg)` entries.
"""
function compute_B_data(x_vec::Vector{Float64}, ps::Vector{Bool}, regression_deg::Int64)

    parent_values = x_vec[ps]
    features = ones(n_coeffs(length(parent_values), regression_deg))

    # Slot 1 keeps the constant term; interaction terms are written from slot 2 on.
    slot = 2
    for degree in 1:regression_deg,
        subset in Combinatorics.combinations(parent_values, degree)

        features[slot] = prod(subset)
        slot += 1
    end

    return features
end

"""
    generate_next_timestep_data(x_vec::Vector{Float64},
                                parent_sets::Vector{Vector{Bool}},
                                regression_coeffs::Vector{Vector{Float64}},
                                regression_deg::Int64,
                                regression_std::Float64)

Simulate the next state of the system from the current state `x_vec`.

For variable `i`, the new value is the dot product of its feature vector
(`compute_B_data` applied to `x_vec` and `parent_sets[i]`) with
`regression_coeffs[i]`, plus normal noise scaled by `regression_std`.
Returns one value per entry of `parent_sets`.
"""
function generate_next_timestep_data(x_vec::Vector{Float64}, 
                                     parent_sets::Vector{Vector{Bool}},
                                     regression_coeffs::Vector{Vector{Float64}},
                                     regression_deg::Int64,
                                     regression_std::Float64)

    return Float64[
        dot(compute_B_data(x_vec, parents, regression_deg), regression_coeffs[v]) +
            regression_std*randn()
        for (v, parents) in enumerate(parent_sets)
    ]
end


"""
    generate_time_series(parent_sets::Vector{Vector{Bool}},
                         T::Int64,
                         regression_coeffs::Vector{Vector{Float64}},
                         regression_deg::Int64,
                         regression_std::Float64)

Create a simulated time series with `T` time steps.

Returns a `T x V` matrix, where `V = length(parent_sets)`; row `t` is the
state at time step `t`. The first row is drawn by `initialize_time_series`
with `init_std = regression_std`; each later row is produced from the
previous one by `generate_next_timestep_data`.

For `T == 0` the empty `0 x V` matrix is returned.
"""
function generate_time_series(parent_sets::Vector{Vector{Bool}},
                              T::Int64,
                              regression_coeffs::Vector{Vector{Float64}},
                              regression_deg::Int64,
                              regression_std::Float64)
    
    n_vars = length(parent_sets)
    result = zeros(T, n_vars)

    # With no time steps there is no first row to initialize; writing to
    # result[1,:] would throw a BoundsError.
    T == 0 && return result

    result[1,:] = initialize_time_series(n_vars; init_std=regression_std)
    
    for t=2:T
        # result[t-1,:] is a copy, so the helper receives a plain Vector{Float64}.
        result[t,:] = generate_next_timestep_data(result[t-1,:], parent_sets,
                                                  regression_coeffs,
                                                  regression_deg,
                                                  regression_std)
    end
    
    return result
end


"""
    generate_dataset(T::Int64, N::Int64, parent_sets::Vector{Vector{Bool}},
                     coeff_std::Float64, regression_deg::Int64, regression_std::Float64)

Simulate a dataset of `N` time series, each over `T` time steps, from the
graph structure `parent_sets`.

A single set of regression coefficients is drawn once with
`generate_reg_coeffs` and shared by all `N` series.
"""
function generate_dataset(T::Int64, N::Int64,
                          parent_sets::Vector{Vector{Bool}},
                          coeff_std::Float64, 
                          regression_deg::Int64, regression_std::Float64)

    shared_coeffs = generate_reg_coeffs(parent_sets, coeff_std, regression_deg)

    return map(1:N) do _
        generate_time_series(parent_sets, T, shared_coeffs, regression_deg, regression_std)
    end
end



"""
    modify_and_simulate(ref_ps::Vector{Vector{Bool}}, keep::Int64, add::Int64,
                        T::Int64, N::Int64,
                        coeff_std::Float64,
                        regression_deg::Int64, regression_std::Float64)

Given a reference graph, create a modified version of it and then simulate a
dataset from that modified graph.

`ref_ps` itself is left untouched: its parent sets are copied before
`modify_digraph!` is applied with `keep` and `add`. Returns the tuple
`(modified_graph, dataset)`, where `dataset` comes from `generate_dataset`.
"""
function modify_and_simulate(ref_ps::Vector{Vector{Bool}}, keep::Int64, add::Int64, 
                             T::Int64, N::Int64,
                             coeff_std::Float64, 
                             regression_deg::Int64, regression_std::Float64)

    # Copy each parent set so the in-place modification cannot reach ref_ps.
    modified = map(copy, ref_ps)
    modify_digraph!(modified, keep, add)

    dataset = generate_dataset(T, N, modified, coeff_std, regression_deg, regression_std)

    return modified, dataset
end

"""
    save_dataset(dataset::Vector{Array{Float64,2}}, file_name::String)

Write `dataset` to `file_name` as a tab-delimited table in long format.

Each row of the table is one time step of one time series. The columns are,
in this order: `timeseries` (index of the series within `dataset`),
`timestep` (row index within that series), then `var1`, ..., `varV` holding
the values of that row, where `V` is the number of columns of the first
series.

The columns are assembled explicitly rather than through a `Dict`, so the
header order always matches the order of the data and the two index columns
are stored as integers.
"""
function save_dataset(dataset::Vector{Array{Float64,2}}, file_name::String)

    V = size(dataset[1],2)

    # Index columns: series number and time step of every output row.
    series_ids = Int[]
    timesteps = Int[]
    for (i, timeseries) in enumerate(dataset)
        n_steps = size(timeseries,1)
        append!(series_ids, fill(i, n_steps))
        append!(timesteps, 1:n_steps)
    end

    # Stack all series vertically; row order matches the index columns above.
    stacked = reduce(vcat, dataset)

    columns = AbstractVector[series_ids, timesteps]
    col_names = Symbol[:timeseries, :timestep]
    for v=1:V
        push!(columns, stacked[:,v])
        push!(col_names, Symbol("var",v))
    end

    df = DataFrames.DataFrame(columns, col_names)

    return CSV.write(file_name, df; delim="\t")
end
    

"""
    save_graph(parent_sets::Vector{Vector{Bool}}, file_name::String)

Write the graph `parent_sets` to `file_name` as a comma-delimited 0/1 matrix
without a header row.

The parent sets are concatenated as columns, so column `k` of the written
matrix is `parent_sets[k]`.
"""
function save_graph(parent_sets::Vector{Vector{Bool}}, file_name::String)

    adjacency = hcat(parent_sets...)
    as_ints = convert(Matrix{Int64}, adjacency)
    table = DataFrame(as_ints)

    return CSV.write(file_name, table; delim=",", writeheader=false)
end

save_graph

In [79]:
# Draw the reference graph: 20 vertices, each possible entry present with probability 0.1.
ref = generate_random_digraph(20,0.1)

20-element Array{Array{Bool,1},1}:
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0]
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
 [0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [161]:
convert(Matrix{Float64}, hcat(ref...))

20×20 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.0  1.0  1.0  0.0  1.0  1.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  1.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  1.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  1.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0 

In [168]:
# Write `ref` as a headerless integer matrix (column k is ref[k]).
# NOTE(review): the file is comma-delimited even though its name ends in ".tsv" — confirm which is intended.
CSV.write("dumb_graph.tsv", DataFrame(convert(Matrix{Int64}, hcat(ref...))); delim=",", writeheader=false)

"dumb_graph.tsv"

In [80]:
using PyPlot

In [111]:
# Simulation settings passed to modify_and_simulate.
keep = 20                      # keep at most this many of the reference edges
add = 10                       # number of new edges to insert
T = 100                        # time steps per series
N = 3                          # number of time series
coeff_std = 1.0/length(ref)    # scale of the regression coefficients
regression_deg = 4             # maximum polynomial degree of the regression
# An earlier `regression_std = 1.0/sqrt(length(ref))` assignment was always
# overwritten by this value before use, so only the effective setting is kept.
regression_std = 1.0           # scale of the noise (and of the initial state)
mod_ref, ds = modify_and_simulate(ref, keep, add, T, N, coeff_std, regression_deg, regression_std)

(Array{Bool,1}[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [172]:
# Persist the simulated dataset and the modified graph it was generated from.
save_dataset(ds, "dumb_dataset.tsv")
save_graph(mod_ref, "dumb_graph_saved.csv")

"dumb_graph_saved.csv"

In [None]:
pg = digraph_prior(3.0, ref)

In [None]:
pgc = [copy(pg_v) for pg_v in pg]

In [None]:
modify_digraph!(pgc, 1, 2)

In [None]:
convert(Vector{Bool}, rand(Distributions.Bernoulli(0.25), 20))

In [None]:
using Distributions
rand(Distributions.Bernoulli(0.25))

In [None]:
rand(Distributions.Bernoulli(0.25))

In [None]:
ls = [1,2,3,4]

In [None]:
ls[1:0]

In [None]:
randn(10)

In [None]:
using Combinatorics

In [None]:
for comb in Combinatorics.combinations([1,2,3,4], 2)
    println( prod(comb))
end

In [None]:
compute_B_data([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], [true, true, true, true, false, false], 4)

In [None]:
import LinearAlgebra: dot 

dot([1,2,3],[1,2,3])

In [142]:
Dict{Symbol,Vector{Union{Int64,Float64}}}([:timeseries=>Int[]; :timestep=>Int[]])#; [ (Symbol("var",i)=>Float64[]) for i=1:3]])

Dict{Symbol,Array{Union{Float64, Int64},1}} with 2 entries:
  :timestep   => Union{Float64, Int64}[]
  :timeseries => Union{Float64, Int64}[]

In [143]:
df = DataFrames.DataFrame(Dict([:timeseries=>Int[]; :timestep=>Int[]; [ (Symbol("var",i)=>Float64[]) for i=1:3]]))

Unnamed: 0_level_0,timeseries,timestep,var1,var2,var3
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64


Unnamed: 0_level_0,timeseries,timestep,var1,var2,var3
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,1.0,2.0,1.0,2.0,3.0


In [122]:
Symbol("little", 123)

:little123