<a href="https://colab.research.google.com/github/dcolinmorgan/grph/blob/main/metagenomic_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial: Quick metagenomic analysis using GPU UMAP analysis & visualization

With GPU-accelerated UMAP analysis and visualization, metagenomic samples can be compared faster and explored much more easily.

*   Task: Analyze metagenomic samples for similarity
*   Data: 10 samples
*   [data](https://figshare.scilifelab.se/articles/dataset/Metagenomic_dataset_from_Swedish_urban_lakes/22270225?file=39602290)
*   [paper](https://pubmed.ncbi.nlm.nih.gov/15560821/)

**Insight/ Result:**

*   Over 5x faster for the entire ~10000 cell samples (102s vs 18s)
*   Offers more insight when a static plot would otherwise fail
*   (See also: CPU baseline)

# Setup

(install cuda packages first)

In [None]:
!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu11 cudf-cu11 cugraph-cu11 pylibraft_cu11 raft_dask_cu11 dask_cudf_cu11 pylibcugraph_cu11 pylibraft_cu11

!pip install biopython

!pip install -U --force git+https://github.com/graphistry/pygraphistry.git@cudf
!pip install -U git+https://github.com/graphistry/cu-cat.git@DT


# Import / configure

Get a free API key at https://www.graphistry.com/


In [None]:
import os
from getpass import getpass

import pandas as pd
import cuml,cudf
print(cuml.__version__)

import graphistry

# Credentials come from the environment, falling back to an interactive prompt,
# so that no secret is stored in the notebook or its history.
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
graphistry_username = os.environ.get('GRAPHISTRY_USERNAME') or input('Graphistry username: ')
graphistry_password = os.environ.get('GRAPHISTRY_PASSWORD') or getpass('Graphistry password: ')

graphistry.register(api=3,protocol="https", server="hub.graphistry.com", username=graphistry_username, password=graphistry_password)
graphistry.__version__

In [None]:
# Print the NVIDIA GPU status for this runtime; the cell fails if no NVIDIA driver is available.
!nvidia-smi


# Data Download & Description

In [None]:
# Download figshare file 39602290 and unpack it.
# Presumably this archive provides the /content/All_MAGs/*.fa files read below - confirm.
!wget https://figshare.scilifelab.se/ndownloader/files/39602290
!unzip 39602290
# Download figshare file 39602299; it is loaded later with pd.read_excel as the per-bin metadata table.
!wget https://figshare.scilifelab.se/ndownloader/files/39602299

In [None]:
# Show the first lines of one FASTA file: a '>' header line followed by sequence lines.
!head /content/All_MAGs/Sample_101_S75_bin_1.fa

>Sample_101_S75-bin_1-k141_1338904_length_14014_cov_309.3572
AATCACGCGTACGCCCGCACCTTGAACCGCTTTGCCGCTGCCCCCACATCATCCTCACGAAAGGTACCTT
TTCATGGAAAAAATTATCAAATCCGATGCGGAATGGCGGGCCGTATTGGACCCCGTTCAATATCATGTCC
TACGGGAGTCCGGCACTGAACGCGCCTTTGCCGGCGCGCTGACCGATGAAAAGCGCGAAGGCGAATTTCG
CTGCGCCGGCTGTGAGACTGCCCTGTTTGCTTCGGACACGAAATTTGACAGCGGTTCGGGTTGGCCAAGC
TTTACCGCGCCCGCAGACAATGATGCTGTTGAAGAGCACCGCGATACATCGCACGGCATGGTCCGCATTG
AAGTGCGCTGTGCCGCATGTGAGGGGCATTTGGGCCATGTCTTCCCCGATGGGCCTGGACCGACTGGCCT
GCGTTACTGCATCAACAGCGCCGCGCTTGCATTCGATCCTGAATAACAAGGCGCTTGTCGGCGGTTACGG
GACTGGGTAACACTCGGGCCATGGCACGCGCGCGCAAGATTTCGAAAGAACGTGGCCCAATGGCAACATG
GATACTCCGCATGGTCAAAGCGGGCGTCATCGCGGCGTTGCTGGGCGTCATGGTTCTTGGCATTTTTGTC


# Read in 10 FASTA files to compare

In [None]:
from Bio import SeqIO
import glob,os

N_FASTA_FILES = 10  # number of FASTA files to load (the previous slice [0:9] selected only nine)

# sorted() makes the selection reproducible; the order returned by glob depends on the filesystem.
fasta_paths = sorted(glob.glob('/content/All_MAGs/*.fa'))[:N_FASTA_FILES]

# Collect (record id, sequence) pairs from every selected file.
records = []
for fasta_path in fasta_paths:
    # The context manager closes each file as soon as it has been parsed.
    with open(fasta_path) as handle:
        for fasta in SeqIO.parse(handle, 'fasta'):
            records.append((fasta.id, str(fasta.seq)))

# Build the frame once instead of growing it per file
# (DataFrame.append was removed in pandas 2.0).
B = pd.DataFrame(records, columns=['ID', 'seq']).dropna()

# Trim each record id at '_length', dropping the length/coverage suffix.
B['ID'] = B.ID.str.split('_length').str[0]
# The trimmed id is kept both as a column and as the index.
B.index = B.ID

In [None]:
# Display the combined table; the trimmed record id appears both as the index and as the ID column.
B

Unnamed: 0_level_0,ID,seq
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Sample_103_S3-bin_62-k141_110434,Sample_103_S3-bin_62-k141_110434,CGCTTTGCGCTTGGACCCTCCCTGGGTCAATGTTGTGGCGGTGTCA...
Sample_103_S3-bin_62-k141_311323,Sample_103_S3-bin_62-k141_311323,CCTTTCAGATGCAGGATATTCCTGCTGAAGACAGCGATACCTACGA...
Sample_103_S3-bin_62-k141_647860,Sample_103_S3-bin_62-k141_647860,CCCAGCCTGGTTAAAGTGAGTTGCCGTAAATGAAATTCCGAACTTG...
Sample_103_S3-bin_62-k141_133789,Sample_103_S3-bin_62-k141_133789,GTTTTAAAAAAGCTTATTTAAATCAATGACTTAGGATTCTTCTAGG...
Sample_103_S3-bin_62-k141_167329,Sample_103_S3-bin_62-k141_167329,AGCAGGGCCTCGCTCTCATCGACTGGCATATCCAAGATGTAACTAT...
...,...,...
Sample_111_S82-bin_162-k141_418176,Sample_111_S82-bin_162-k141_418176,GTGTTGTTGACCCGGATCGTCGCGCTCGCGACAGGCAGGCTGCTGG...
Sample_111_S82-bin_162-k141_649044,Sample_111_S82-bin_162-k141_649044,ACTGCCGGGTGCTTGTAGTCGTTCGCGAACACGAGCGCGCTCACCC...
Sample_111_S82-bin_162-k141_289115,Sample_111_S82-bin_162-k141_289115,ACAGCGCGAACAGGCCCAGGTTGATGCGCTGGATGATCACCATCGA...
Sample_111_S82-bin_162-k141_1633793,Sample_111_S82-bin_162-k141_1633793,GCCTGCACCGGCTCGCGGTCGGCTCGCTCTCCGGGATCGGCGCACC...


In [None]:

# Wrap the contig table as Graphistry nodes and run UMAP on it.
g = graphistry.nodes(B)
g2 = g.umap()

# Node embedding computed by UMAP (read from the plotter's private attribute).
emb2 = g2._node_embedding

# Plot the embedding rows as nodes keyed by 'ID', reuse the edges produced by umap(),
# and bind the x / y columns as node positions.
embedding_nodes = emb2.reset_index()
g222 = (
    graphistry
    .nodes(embedding_nodes, 'ID')
    .edges(g2._edges, '_src_implicit', '_dst_implicit')
    .bind(point_x="x", point_y="y")
    .settings(url_params={"play": 0})
)
g222.plot()

# Compare clustering distances to family/genus labels (gold standards)

In [None]:
# Load the downloaded metadata file as an Excel workbook.
meta = pd.read_excel('/content/39602299')

# Split the ';'-delimited pplacer taxonomy string into one column per field,
# then label each row with its bin name.
A = meta['pplacer_taxonomy'].str.split(';', expand=True)
A.index = meta['Bin_name']
A

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sample_101_S75_bin_1,d__Bacteria,p__Proteobacteria,c__Alphaproteobacteria,o__Sphingomonadales,f__Sphingomonadaceae,g__Sphingorhabdus_B,s__
Sample_101_S75_bin_10,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Flavobacteriales,f__Crocinitomicaceae,g__40-80,s__
Sample_101_S75_bin_101,d__Bacteria,p__Actinobacteriota,c__Actinomycetia,o__Nanopelagicales,f__S36-B12,g__Mxb001,s__
Sample_101_S75_bin_102,d__Bacteria,p__Cyanobacteria,c__Cyanobacteriia,o__Cyanobacteriales,f__Microcystaceae,g__,s__
Sample_101_S75_bin_103,d__Bacteria,p__Proteobacteria,c__Alphaproteobacteria,o__Rickettsiales,f__UBA1997,g__SYAR01,s__
...,...,...,...,...,...,...,...
Sample_113_S84_bin_95,d__Bacteria,p__Proteobacteria,c__Alphaproteobacteria,o__CACIAM-22H2,f__CACIAM-22H2,g__,s__
Sample_113_S84_bin_96,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Burkholderiales,f__Burkholderiaceae,g__Limnohabitans_A,s__
Sample_113_S84_bin_97,d__Bacteria,p__Cyanobacteria,c__Cyanobacteriia,o__PCC-6307,f__Cyanobiaceae,g__,s__
Sample_113_S84_bin_98,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Burkholderiales,f__Burkholderiaceae,g__Polynucleobacter,s__


In [None]:
# Deliberate stop so that "Run All" ends here with a readable message
# (previously an undefined name raised a NameError).
# NOTE(review): presumably the 'try' sections below are earlier scratch work - confirm.
raise RuntimeError("End of tutorial: the cells below are exploratory and must be run manually.")

# Try #2

[pull data](https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&acc=SRR6747711&display=data-access)
from [papers](https://www.sciencedirect.com/science/article/pii/S0160412019321774#ec-research-data) [and 2](https://pubs.acs.org/doi/10.1021/acs.est.8b03446)

In [None]:
# !wget https://sra-pub-run-odp.s3.amazonaws.com/sra/SRR6747711/SRR6747711
# !wget AWS	s3://sra-pub-src-5/SRR6747711/161002_I137_FCH7YT3BBXX_L1_wHAXPI035554-18_1.fq.gz

import locale
def getpreferredencoding(do_setlocale = True):
    """Report UTF-8 as the preferred encoding, ignoring the runtime's locale settings."""
    return "UTF-8"
# NOTE(review): presumably a workaround for a runtime that reports a non-UTF-8 locale - confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

from itertools import islice
from Bio import SeqIO
import gzip

MAX_READS = 500  # only keep the first 500 reads to avoid running out of memory

# NOTE(review): '39602290' is the archive that the download cell unzips; confirm that a
# FASTQ file (not this archive) is the intended input here.
# islice stops the parser after MAX_READS records instead of scanning the whole file.
reads = [rec.seq for rec in islice(SeqIO.parse('39602290', "fastq"), MAX_READS)]
count = len(reads)  # number of reads actually loaded

# take a look at some of the reads
reads[0:20]

# Try #1

[pull metagenomic data](https://www.ncbi.nlm.nih.gov/nuccore/2496718099)


In [None]:
# Download step, left disabled: un-comment both lines to fetch and decompress KHUX01.1.fsa_nt.
# !wget https://sra-download.ncbi.nlm.nih.gov/traces/wgs01/wgs_aux/KH/UX/KHUX01/KHUX01.1.fsa_nt.gz
# !gunzip KHUX01.1.fsa_nt.gz



In [None]:
from Bio import SeqIO

# Parallel lists: record ids and their sequences, in file order.
identifiers = []
sequences = []

# KHUX01.1.fsa_nt must already be present (decompressed) in the working directory.
with open('KHUX01.1.fsa_nt') as handle:
    for fasta in SeqIO.parse(handle, 'fasta'):
        name, sequence = fasta.id, str(fasta.seq)
        identifiers.append(name)
        sequences.append(sequence)

In [None]:
# Stack the two parallel lists as rows, then transpose so each record becomes one row.
sequence_table = pd.DataFrame([identifiers, sequences]).transpose()
sequence_table.columns = ['ID', 'seq']

# Keep only rows where both the id and the sequence are present.
A = sequence_table.dropna()

In [None]:
# !pip install -U --force git+https://github.com/graphistry/pygraphistry.git@cudf
!pip install graphistry[ai] --quiet

In [None]:
import os
from getpass import getpass

import graphistry

# Credentials come from the environment, falling back to an interactive prompt,
# so that no secret is stored in the notebook or its history.
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
graphistry_username = os.environ.get('GRAPHISTRY_USERNAME') or input('Graphistry username: ')
graphistry_password = os.environ.get('GRAPHISTRY_PASSWORD') or getpass('Graphistry password: ')

graphistry.register(api=3,protocol="https", server="hub.graphistry.com", username=graphistry_username, password=graphistry_password)
graphistry.__version__

In [None]:
# Wrap the sequence table as Graphistry nodes.
g = graphistry.nodes(A)

# Run UMAP with the 'umap_learn' engine, then plot the result.
g_umap = g.umap(engine='umap_learn')
g_umap.plot()