In [None]:
# importing library
include("src/GeneRBM.jl") # RBM
using PyPlot # matplotlib
using Distributions
using DelimitedFiles # readdlm

In [None]:
# Read the haplotype table, drop its first two columns and transpose the rest,
# so that X is laid out as features × samples.
path_X = "../1000G_real_genomes/"
X = readdlm(path_X * "805_SNP_1000G_real.hapt")
X = convert(Matrix{Float64}, X[:, 3:end])'  # Julia arrays are 1-based

# rows of X (Nv) and columns of X, i.e. number of samples (Ns)
Nv, Ns = size(X)
# number of hidden nodes
Nh = 50;

In [None]:
# The RBM package expects the data as features × samples (X' gives the transpose).
# Build the model from the sizes Nv and Nh, then call setPCD! on it
# (presumably selects persistent contrastive divergence — confirm in GeneRBM.jl).
rbm = RBM(Nv, Nh; sigma=1e-5);
setPCD!(rbm);

In [None]:
# Initialise the biases of the model from the training samples in X
initBiasFromSamples(rbm, X);

In [None]:
# Save a scatter plot after each epoch; "Demo1kSig" is stored as the file-name
# parameter in the model context.
rbm.ctx["fnameParam"] = "Demo1kSig"
setSaveSc!(rbm, 1)

In [None]:
# Training hyper-parameters
lr = 0.01       # learning rate
n_iter = 40     # number of iterations for the algorithm
t_max = 10      # number of MCMC iterations / mini-batch, in [10, 100]
n_pcd = 500     # number of persistent chains, in [50, 1000]
δt = 100        # how often progress information is shown

# Settings used in earlier runs (batch_size=32 and δt=100 in both):
#   lr=0.001, n_iter=1000, t_max=10, n_pcd=50
#   lr=0.001, n_iter=1000, t_max=5,  n_pcd=500
fit(rbm, X;
    lr=lr,
    n_iter=n_iter,
    batch_size=32,
    t_max=t_max,
    n_pcd=n_pcd,
    δt=δt)

In [None]:
# Plot the two traces recorded on the model during training (rbm.fe_tr and
# rbm.pl_tr), both labelled as approximate likelihoods.
for (trace, tag) in ((rbm.fe_tr, "Approx Likelihood1"),
                     (rbm.pl_tr, "Approx Likelihood2"))
    plt[:plot](trace, label=tag)
end
plt[:legend]()
plt[:show]()

In [None]:
# Sample from the trained model with t_max=500, from three different starting
# points. Only the first value returned by `sampling` is kept each time
# (presumably the visible samples — confirm in GeneRBM.jl).

# 1) random initial condition: 5000 chains with uniform entries in [0, 1)
s_v,_,_,_ = sampling(rbm, rand(rbm.Nv, 5000); t_max=500)
# 2) chains started on the dataset
s_v_X,_,_,_ = sampling(rbm, X; t_max=500)
# 3) chains started on the persistent chain stored in the model (rbm.p_contdiv).
#    This call used to receive X again, which made s_v_pc just a second
#    data-started run. A copy is passed so that the stored chain stays untouched
#    whether or not `sampling` works in place (not visible from this file).
s_v_pc,_,_,_ = sampling(rbm, copy(rbm.p_contdiv); t_max=500)

In [None]:
# Mean-field fixed point: run meanFieldIte with t_max=200, starting from the
# samples in s_v_pc; only the first returned value is kept.
m_v_pc, _ = meanFieldIte(rbm, s_v_pc; t_max=200)

In [None]:
# Singular value decompositions of the data, of the three sample sets and of the
# weight matrix. `svd` lives in LinearAlgebra on Julia ≥ 0.7; it is loaded here
# explicitly so the cell does not depend on the included file bringing it in.
using LinearAlgebra: svd

# Each sample matrix is scaled by 1/sqrt(number of its own columns), so the
# spectra stay comparable when the sample sets differ in size (no hard-coded count).
ux,sx,vx = svd(X / sqrt(size(X, 2)))
u1,s1,v1 = svd(s_v / sqrt(size(s_v, 2)))
u2,s2,v2 = svd(s_v_X / sqrt(size(s_v_X, 2)))
u3,s3,v3 = svd(s_v_pc / sqrt(size(s_v_pc, 2)))
# decomposition of the weight matrix itself (vv is used for the projections below)
uu,ss,vv = svd(rbm.W)

In [None]:
# Compare the 20 leading singular values of the data with those of the three
# sample sets, on log-log axes.
xr = collect(1:length(sx))
for (spectrum, tag) in ((sx, "data"),
                        (s1, "rdm-start"),
                        (s2, "data-start"),
                        (s3, "pc-start"))
    plt[:loglog](xr[1:20], spectrum[1:20], label=tag)
end
plt[:legend]()

In [None]:
# Project every sample set onto vv (from the SVD of W): one row per sample,
# one column per component. Order: data, random-start samples, data-start
# samples, persistent chain, mean-field result.
scX, scG, scXG, scPC, scPC_fp =
    map(A -> A' * vv, (X, s_v, s_v_X, rbm.p_contdiv, m_v_pc));

In [None]:
# 3×3 grid of 2-D histograms in the projected space.
#   row r    -> components (2r-1, 2r), i.e. (1,2), (3,4), (5,6)
#   column c -> data, random-start samples, data-start samples
# On every panel the persistent chain is overlaid in red and the mean-field
# points in green.
f, ax = subplots(3, 3, figsize=(15, 10))
for row in 1:3
    cx, cy = 2row - 1, 2row
    for (col, proj) in enumerate((scX, scG, scXG))
        ax[row, col][:hist2d](proj[:, cx], proj[:, cy], bins=50)
    end
    for col in 1:3
        ax[row, col][:scatter](scPC[:, cx], scPC[:, cy], color="red", s=2)
    end
    for col in 1:3
        ax[row, col][:scatter](scPC_fp[:, cx], scPC_fp[:, cy], color="green", s=10)
    end
end

In [None]:
# Overlap matrix between the dataset and the data-started samples.
# Entries are mapped through x -> 2x - 1 (0/1 becomes -1/+1, assuming binary
# data) and the inner products are divided by the number of features, taken
# from X itself rather than hard-coded.
plt[:figure](figsize=(10,10))
plt[:imshow]((2*X .- 1)' * (2*s_v_X .- 1) / size(X, 1))
plt[:colorbar]()

In [None]:
# Follow a few trajectories from random initial conditions and record a
# snapshot after every call to `sampling`.
# NOTE: this reassigns the global t_max that was used for training earlier.
t_max = 500   # number of snapshots recorded after the initial state
Δt = 10       # value passed as t_max to `sampling` between two snapshots
n_traj = 5    # number of trajectories followed in parallel
s_v_t = rand(rbm.Nv, n_traj)
# s_all[k, :, j] is the state of trajectory j at snapshot k (k = 1: initial state)
s_all = zeros(t_max + 1, rbm.Nv, n_traj)
s_all[1, :, :] .= s_v_t
for t in 1:t_max
    # s_v_t is a global reassigned inside the loop; declaring it keeps the cell
    # working outside the notebook's soft-scope handling (plain script, include)
    global s_v_t
    s_v_t, _, _, _ = sampling(rbm, s_v_t; t_max=Δt)
    s_all[1 + t, :, :] .= s_v_t
end

In [None]:
# Project every recorded trajectory onto vv: sc_all[nt] has one row per
# snapshot and one column per component. Built with a comprehension so the
# vector gets a concrete element type instead of Any.
sc_all = [s_all[:, :, nt] * vv for nt in 1:n_traj]

In [None]:
# Draw the trajectories on top of the data histogram: the upper panel uses
# components (1, 2), the lower panel components (3, 4).
f, ax = plt[:subplots](2, 1, figsize=(10, 15))
for (panel, (cx, cy)) in enumerate(((1, 2), (3, 4)))
    ax[panel][:hist2d](scX[:, cx], scX[:, cy], bins=50)
    for nt in 1:n_traj
        ax[panel][:scatter](sc_all[nt][:, cx], sc_all[nt][:, cy], s=2)
    end
end
