In [2]:
import pyximport
pyximport.install()

import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
import dgl
from sklearn.cluster import AgglomerativeClustering


import DRBin
import DRBin.utils
from DRBin.models import DGI, LogReg
from DRBin import process

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_cluster import radius_graph

from sklearn.preprocessing import normalize
import scipy.sparse as sp

%matplotlib inline

In [112]:
# Parse the contig FASTA (opened in binary mode) into the three arrays
# returned by DRBin's contig reader.
with open('/home/maog/data/urog/contigs.fna', 'rb') as filehandle:
    tnfs, contignames, lengths = DRBin.parsecontigs.read_contigs(filehandle)

# The abundance table lives under the default 'arr_0' key of the .npz archive.
rpkms = np.load('/home/maog/data/urog/abundance.npz')['arr_0']

# The VAE is sized by the number of abundance columns (rpkms.shape[1]).
vae = DRBin.encode.VAE(nsamples=rpkms.shape[1])

# NOTE(review): `mask` is returned here but not used by any visible cell;
# presumably it flags which contigs made it into the dataloader - confirm that
# downstream row indices still line up with `contignames`.
dataloader, mask = DRBin.encode.make_dataloader(rpkms, tnfs)
vae.trainmodel(dataloader)

# Encode with the trained model and persist the result as plain text.
latent = vae.encode(dataloader)
np.savetxt('/home/maog/data/urog/latent.txt', latent)
#latent = np.loadtxt('/home/maog/data/metahit/hit_latent.txt')

In [3]:
# Build a neighbourhood graph over the latent vectors and save its adjacency
# matrix (scipy CSR) for the DGI cell below.

# NOTE(review): `batch` is not read by any visible cell; kept in case another
# cell relies on it.
batch = torch.zeros(latent.shape[0])
latent = torch.tensor(latent)

# Edges between points within distance r of each other; loop=False excludes
# self-edges.
edge_index = radius_graph(latent, r=10, loop=False)
u, v = edge_index[0], edge_index[1]

# Pass num_nodes explicitly: without it dgl.graph infers the node count from
# the largest node ID that occurs in an edge, so trailing rows of `latent`
# that have no neighbour would be dropped and the adjacency matrix would no
# longer have one row per latent vector.
g = dgl.graph((u, v), num_nodes=latent.shape[0])
bg = dgl.to_bidirected(g)
knn_graph = bg.adj(scipy_fmt='csr')
sp.save_npz('/home/maog/data/urog/knngraph.npz', knn_graph)

In [4]:
# Train a DGI model on the latent features and the saved neighbourhood graph,
# keep the checkpoint with the lowest training loss, and extract embeddings.

# training params
batch_size = 1
nb_epochs = 500
patience = 20  # epochs without a new best loss tolerated before stopping
lr = 0.001
l2_coef = 0.0
drop_prob = 0.3
hid_units = 32
sparse = True
nonlinearity = 'prelu' # special name to separate parameters
features = latent
adj = sp.load_npz('/home/maog/data/urog/knngraph.npz')

features = sp.csr_matrix(features)
features, _ = process.preprocess_features(features)
nb_nodes = features.shape[0]
ft_size = features.shape[1]
# Add self-connections before normalising the adjacency.
adj = process.normalize_adj(adj + sp.eye(adj.shape[0]))

if sparse:
    sp_adj = process.sparse_mx_to_torch_sparse_tensor(adj)
else:
    adj = (adj + sp.eye(adj.shape[0])).todense()

# Prepend a batch axis of size 1 (features[np.newaxis]).
features = torch.FloatTensor(features[np.newaxis])
if not sparse:
    adj = torch.FloatTensor(adj[np.newaxis])
model = DGI(ft_size, hid_units, nonlinearity)
optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef)

if torch.cuda.is_available():
    print('Using CUDA')
    model.cuda()
    features = features.cuda()
    if sparse:
        sp_adj = sp_adj.cuda()
    else:
        adj = adj.cuda()
b_xent = nn.BCEWithLogitsLoss()
xent = nn.CrossEntropyLoss()
cnt_wait = 0
best = 1e9
best_t = 0

for epoch in range(nb_epochs):
    model.train()
    optimiser.zero_grad()

    # Corrupted input: the same features with the node order shuffled.
    idx = np.random.permutation(nb_nodes)
    shuf_fts = features[:, idx, :]

    # Label 1 for the real features, 0 for the shuffled ones.
    lbl_1 = torch.ones(batch_size, nb_nodes)
    lbl_2 = torch.zeros(batch_size, nb_nodes)
    lbl = torch.cat((lbl_1, lbl_2), 1)

    if torch.cuda.is_available():
        shuf_fts = shuf_fts.cuda()
        lbl = lbl.cuda()

    logits = model(features, shuf_fts, sp_adj if sparse else adj, sparse, None, None, None)

    loss = b_xent(logits, lbl)

    #print('Loss:', loss)

    # Compare plain floats so `best` does not keep a tensor (and its autograd
    # graph) alive between epochs.
    loss_value = loss.item()
    if loss_value < best:
        best = loss_value
        best_t = epoch
        cnt_wait = 0
        torch.save(model.state_dict(), '/home/maog/data/urog/best_dgi.pkl')
    else:
        cnt_wait += 1

    # Early stopping: `patience` and `cnt_wait` were tracked but never
    # checked, so training always ran for the full nb_epochs.
    if cnt_wait == patience:
        print('Early stopping!')
        break

    loss.backward()
    optimiser.step()

# Restore the best checkpoint and compute the embeddings.
model.load_state_dict(torch.load('/home/maog/data/urog/best_dgi.pkl'))
embeds, _ = model.embed(features, sp_adj if sparse else adj, sparse, None)
# Drop the size-1 batch axis and move to numpy.
embeds = embeds.squeeze(-3).cpu()
embeds = embeds.numpy()
features = features.squeeze(-3)
features = features.cpu().numpy()

#a = 0.73
#get the final vector for clustering
#X = (a * features + (1 - a) * embeds)

Using CUDA


In [113]:
# Persist the `tnfs` array as plain text next to the other urog artefacts.
np.savetxt(fname='/home/maog/data/urog/tnfs.txt', X=tnfs)

In [3]:
from DRBin.my_cluster import *
from Bio import SeqIO

In [4]:
# Build lookup tables for the contigs in the FASTA file:
#   contig_length: record id -> sequence length
#   contig_id_idx: record id -> integer index
#   contig_idx_id: integer index -> record id
# NOTE(review): this cell reads the `skin` data set while the cells that write
# and benchmark the bins use `urog` paths - confirm which data set is intended.
contigs = '/home/maog/data/skin/contigs.fna'
contig_length, contig_id_idx, contig_idx_id = {}, {}, {}
for record in SeqIO.parse(contigs, "fasta"):
    record_id = record.id
    # The next index is the number of ids registered so far.
    next_idx = len(contig_id_idx)
    contig_length[record_id] = len(record.seq)
    contig_idx_id[next_idx] = record_id
    contig_id_idx[record_id] = next_idx

In [None]:
# Run DRBin's binning on the directory `output` with the contig FASTA path
# `contigs` (defined in the cell above).
# NOTE(review): the `skin` directory here differs from the `urog` paths used
# when the clusters are written out - confirm this is intended.
output = '/home/maog/data/skin'
res = perform_binning(output, contigs)
# Display how many keys the result has; the next cell iterates it as
# key -> collection of contig indices.
len(res.keys())

In [None]:
import collections

# Translate the index-based binning result into contig names:
# each key and each member index is mapped through contig_idx_id.
cluster = collections.defaultdict(set)
for bin_idx, member_idxs in res.items():
    member_names = {contig_idx_id[member] for member in member_idxs}
    # Only create an entry when there is at least one member, so empty
    # member lists do not produce empty clusters.
    if member_names:
        cluster[contig_idx_id[bin_idx]] |= member_names
len(cluster.keys())

In [119]:
# Write the clusters to a .tsv file, then write one FASTA file per bin.
# NOTE(review): outputs and the FASTA below are under `urog`, while `cluster`
# is built from a `skin` contig file in the visible cells - confirm the data
# sets match before trusting the written bins.

# This writes a .tsv file with the clusters and corresponding sequences
with open('/home/maog/data/urog/result/DRBin_cluster.tsv', 'w') as cluster_fh:
    DRBin.utils.write_clusters(cluster_fh, cluster)

# Only keep contigs in any filtered bin in memory.
# set().union(...) also works when `cluster` is empty, where
# set.union(*[]) would raise a TypeError.
keptcontigs = set().union(*cluster.values())

with open('/home/maog/data/urog/contigs.fna', 'rb') as fasta_fh:
    fastadict = DRBin.utils.loadfasta(fasta_fh, keep=keptcontigs)

bindir = '/home/maog/data/urog/result/bins'
DRBin.utils.write_bins(bindir, cluster, fastadict, maxbins=1500)

In [None]:
# First load in the Reference
# The reference is read from a .tsv file and is reused by the taxonomy and
# benchmarking cells below.
reference_path = '/home/maog/data/urog/reference.tsv'

# IPython shell escape: preview the file before parsing it.
!head $reference_path # show first 10 lines of reference file

with open(reference_path) as reference_file:
    reference = DRBin.benchmark.Reference.from_file(reference_file)

In [None]:
# Attach taxonomy information to the already-loaded `reference` object.
taxonomy_path = '/home/maog/data/urog/taxonomy.tsv'

# IPython shell escape: preview the taxonomy file. The inline shell comment
# says "reference file", but the path shown is the taxonomy file.
!head $taxonomy_path # show first 10 lines of reference file

with open(taxonomy_path) as taxonomy_file:
    reference.load_tax_file(taxonomy_file)

In [122]:
# Read the DRBin clusters back from disk and benchmark them against
# `reference`.
# NOTE(review): minsize is presumably a minimum bin size in bases - confirm.
with open('/home/maog/data/urog/result/DRBin_cluster.tsv') as drbin_tsv:
    DRBin_clusters = DRBin.utils.read_clusters(drbin_tsv)
    DRBin_bins = DRBin.benchmark.Binning(
        DRBin_clusters,
        reference,
        minsize=100_000,
    )

In [None]:
# Print the DRBin benchmark summary, one tab-separated line per entry.
print('DRBin bins:')
for summary_row in DRBin_bins.summary():
    # print() str()-converts each field and joins them with the separator.
    print(*summary_row, sep='\t')

In [130]:
# Read the vamb clusters from disk and benchmark them against the same
# `reference`, with the same minsize as the DRBin benchmark.
with open('/home/maog/data/urog/vamb_cluster.tsv') as vamb_tsv:
    vamb_clusters = DRBin.utils.read_clusters(vamb_tsv)
    vamb_bins = DRBin.benchmark.Binning(
        vamb_clusters,
        reference,
        minsize=100_000,
    )

In [None]:
# Print the vamb benchmark summary, one tab-separated line per entry.
print('vamb bins:')
for summary_row in vamb_bins.summary():
    # print() str()-converts each field and joins them with the separator.
    print(*summary_row, sep='\t')

In [144]:
# For each recall threshold, look up the count stored under the key
# (recall, 0.95) for both binners.
recalls = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# NOTE(review): j selects one entry of `counters`; presumably a taxonomic
# rank - confirm which one index 2 refers to.
j = 2
vambbins = [vamb_bins.counters[j][(recall, 0.95)] for recall in recalls]
DRBins = [DRBin_bins.counters[j][(recall, 0.95)] for recall in recalls]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Plot the counts collected in `vambbins` and `DRBins` against the recall
# thresholds, one line per binner.
fig, ax = plt.subplots(figsize=(6.4, 4.8), dpi=100)
recalls = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# markersize is given as a number (it was the string '4').
ax.plot(recalls, vambbins, label='vamb', linestyle='-', marker='o', markersize=4)
ax.plot(recalls, DRBins, label='DRBin', linestyle='-', marker='o', markersize=4)
# NOTE(review): the title looks like a leftover from another figure - confirm.
plt.title('#input analysis', fontsize=18)
# The x values are the recall thresholds, so label the axis accordingly
# (it was labelled 'b').
ax.set_xlabel('recall', fontsize=13)
ax.set_ylabel('bins at precision = 0.95', fontsize=13)
ax.tick_params(axis='both', labelsize=11)
ax.grid(True, linestyle='-.', linewidth=1)
legend = ax.legend(loc='best', framealpha=0.2)