## RD900 region - identification of duplication events in MTBC

**Objective:**

Identify duplication events in the RD900 region of the MTBC (Mycobacterium tuberculosis complex) using available genome assemblies.

**Method:**

1. identify the assemblies to use, including sub-sampling the MTB and M. bovis genomes by choosing those with the best assembly quality
2. decide which sub sequences we want to use to detect presence of a gene in contigs
3. for each assembly:
    * make a blast database
    * blast sub-sequence(s) to it
    * store result
4. make a matrix of presence absence for each genome
5. make a species tree and use the matrix to determine parsimony

**Links:**

* https://github.com/dmnfarrell/rd900


<img src="img/pknh_orthologs.png" width=500></img>

In [1]:
import os, glob, subprocess
import shlex
import urllib
import urllib.request
from importlib import reload

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
from BCBio import GFF
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord

import tools
# Let pandas use up to 140 characters per line when printing frames as text.
pd.options.display.width = 140

In [2]:
# Table of candidate genome assemblies, one row per assembly.
assemblies = pd.read_csv('mtb_assemblies.csv')
# Peek at the last four rows.
assemblies.tail(4)

Unnamed: 0,Organism_Name,Strain,CladeID,BioSample,BioProject,Group,SubGroup,Assembly,Size (Mb),GC%,...,WGS,Scaffolds,Genes,Proteins,Release Date,Modify Date,Level,RefSeq FTP,GenBank FTP,species
6595,Mycobacterium marinum,DL240490,20835,SAMN07811432,PRJNA414948,Terrabacteria group,Actinobacteria,GCA_003431775.1,5.78055,65.8,...,PEDJ01,289.0,5234.0,4484.0,2018/08/29,2018/09/03,Scaffold,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003...,marinum
6596,Mycobacterium marinum,DE4576,20835,SAMN07811431,PRJNA414948,Terrabacteria group,Actinobacteria,GCA_003431805.1,6.4167,65.8,...,PEDK01,133.0,5561.0,5291.0,2018/08/29,2018/09/02,Scaffold,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003...,marinum
6597,Mycobacterium orygis,ASM638503v1,20835,SAMN11890852,PRJNA545406,Terrabacteria group,Actinobacteria,GCA_006385035.1,,,...,,,,,,,Scaffold,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/006...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/006...,orygis
6598,Mycobacterium orygis,Morygis1.0,20835,SAMN02470820,PRJNA193095,Terrabacteria group,Actinobacteria,GCA_000353205.1,,,...,,,,,,,Scaffold,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,orygis


### Filter assemblies to use

In [14]:
# Keep every assembly that is not labelled 'mtb'; for 'mtb' itself keep only
# the rows whose Level is 'Complete Genome'.
is_complete = assemblies.Level == 'Complete Genome'
is_mtb = assemblies.species.isin(['mtb'])
# BioProjects that are excluded from the analysis altogether.
omit = ['PRJNA407834','PRJNA287158','PRJNA214551']
in_omitted_project = assemblies.BioProject.isin(omit)
filtered = assemblies[(is_complete | ~is_mtb) & ~in_omitted_project]
# Report how many assemblies survive and how they split by species.
print (len(filtered),len(assemblies))
print (filtered.species.value_counts())

254 6599
mtb           87
BCG           37
africanum     31
bovis         28
marinum       25
H37Rv          9
canettii       9
ulcerans       7
CDC            7
microti        3
pinnipedii     3
H37Ra          3
caprae         3
orygis         2
Name: species, dtype: int64


### test data fetch

In [4]:
# Smoke test of the download step: fetch the assembly listed in the first row
# of the table into the local assemblies/ folder.
# Create the target folder first; urlretrieve fails if it does not exist.
os.makedirs('assemblies', exist_ok=True)
url = assemblies.loc[0, 'GenBank FTP']
# tools.get_url_from_path presumably turns the FTP folder into the URL of the
# gzipped FASTA file inside it -- TODO confirm against tools.py
link = tools.get_url_from_path(url)
# NOTE(review): the accession is hard-coded rather than read from row 0's
# 'Assembly' column -- confirm the two agree.
name = 'GCA_000934325.3'
urllib.request.urlretrieve(link, os.path.join('assemblies', name+'.fa.gz'))

('assemblies/GCA_000934325.3.fa.gz', <email.message.Message at 0x7f81485b6160>)

### make local blast database

In [15]:
def make_blast_db(infile, out='test'):
    """Build a nucleotide BLAST database from a gzipped FASTA file.

    The file is decompressed with ``gunzip -c`` and piped into
    ``makeblastdb``; the database files are written to ``blastdb/<out>``.

    Args:
        infile: path to the gzipped FASTA file.
        out: base name of the database inside the blastdb/ folder; also
            used as the database title.

    Raises:
        subprocess.CalledProcessError: if the shell pipeline exits non-zero.
    """
    # Make sure the output folder exists before makeblastdb writes into it.
    os.makedirs('blastdb', exist_ok=True)
    # Quote everything interpolated into the shell command so that paths with
    # spaces or shell metacharacters cannot break (or hijack) the pipeline.
    cmd = 'gunzip -c {i} | makeblastdb -in - -dbtype nucl -out {o} -title {t}'.format(
        i=shlex.quote(infile),
        o=shlex.quote(os.path.join('blastdb', out)),
        t=shlex.quote(out))
    subprocess.check_output(cmd, shell=True)
    return

#make_blast_db('assemblies/GCA_000934325.3.fa.gz')

### determine sequences to blast using sub sequences within africanum RD900 region

In [17]:
def _subrecord(record, start, end, label):
    """Return record[start:end] as a new record whose id is set to `label`."""
    piece = record[start:end]
    piece.id = label
    return piece

# Reference sequence; the coordinates below are Python slice offsets into it.
rd900seq = SeqIO.read('RD900MAF.fa', format='fasta')

# Query regions cut out of the reference.
pknh1 = _subrecord(rd900seq, 783, 2663, 'pknh1')
tbd2 = _subrecord(rd900seq, 2907, 5303, 'tbd2')
pknh2 = _subrecord(rd900seq, 5314, 6981, 'pknh2')
# Short stretch lying inside the pknh1 slice.
pro1 = _subrecord(rd900seq, 1612, 1772, 'pknh1-proregion')
# Leading part of the pknh2 slice (same start offset, ends at 6000).
pknh2_sensor = _subrecord(rd900seq, 5314, 6000, 'pknh2_sensor')
print (pknh2_sensor.seq)

TACCCGTACTTGGCCCACCAGTTGTGCAGATCCTCAATGGTCGCGGGATCCCCGAAGACGTCGCTGAGCGTTAGCTTGGCCTCGTTGCTCCAGATCACGTTCGGGCGATTCTTATAGGTGCCGCACGCGATCATGCCTGCGGTCACGTCTGGGGTCTGGTTGTAATGCCAGCCATCCGGTGATGGTCCTTCACCGGGACAGTTCATCAGCTCCACGGCGGCGATATCGTCGTTGAAGGCCTGTTTCAGCTTGTCGGGATTGGCGAACAATCCATAGATGGCGCGACTTGGCCCACCCTGGTTGGTGTTTTGCCCGCAGTCGACCATCGCCACGGCGTTCACCCATATGCTGTTCGGCTTCGGCGTGGTCGGTTTACAGGTGCCGGTCGGATAGCCCGACGGCAACATGCTGAGCAGCCTGGTCTGCGGGTCGCTGGCCGGTGCTGTGGTCGGCGTTGTGGTCGCGGGTAGCGAGGTCGTTGCCGTGGTGGTGGGGGTGCCTGGGGAGGTCGCGATGTTCCGTTTTGGGTTGTCGTCCGGTCGGTTGGCGATCCAGATGCCGATGGCGCCCAACACGAGGACGAGCACGACGGCGGCGGCGACGGCCACAAAGGGCCACGGGTTCGTTTTGCGTGGGGTCTGGGCCCAGGGGCTGGGGCCGCCGGACGGCGGTGCGCCCCAGCCGCC


### loop over assemblies and get results

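Here we loop over each assembly in the filtered table, fetch the file remotely if it is not already present, make a blast db from it and blast each query sub-sequence against it. For every query we record the percent identity of the first hit whose length exceeds 50% of the query length (0 if there is no such hit).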

In [None]:
def run(seqs):
    """Blast each query sequence against every assembly in `filtered`.

    For each row of the module-level `filtered` table the assembly FASTA is
    downloaded into assemblies/ if it is not already there, a BLAST database
    is built under blastdb/ if it is missing, and every record in `seqs` is
    blasted against that database.

    Args:
        seqs: iterable of sequence records; each must have an `id` and
            support `len()`.

    Returns:
        DataFrame with one row per assembly. There is one column per query id
        holding the `pident` of the first hit whose `length` exceeds half the
        query length (0 when there is no such hit), plus the columns
        'id', 'strain', 'species' and 'name'.
    """
    # The download target folder must exist before urlretrieve writes to it.
    os.makedirs('assemblies', exist_ok=True)
    result=[]
    for _,row in filtered.iterrows():
        acc = row.Assembly
        strain = row.Strain
        species = row.species
        full = row.Organism_Name
        print (strain, acc)
        url = row['GenBank FTP']
        #get file
        filename = os.path.join('assemblies', acc+'.fa.gz')
        # presumably makeblastdb writes a single <name>.nsq for genomes of
        # this size; used here as the marker that the database exists
        dbname = 'blastdb/%s.nsq' %acc
        downloaded = False
        if not os.path.exists(filename):
            link = tools.get_url_from_path(url)
            urllib.request.urlretrieve(link, filename)
            downloaded = True
        #make blast database for this genome: always after a fresh download,
        #and also when the FASTA was already present but the database is not
        #(previously that case went straight to blast and failed)
        if downloaded or not os.path.exists(dbname):
            make_blast_db(filename, out=acc)
        data={}
        for seq in seqs:
            #do blast
            bl = tools.blast_sequences('blastdb/%s' %acc, seq)
            #fraction of the query spanned by each hit
            bl['pcov'] = bl.length/len(seq)
            bl = bl[bl.pcov>.50]
            if len(bl)>0:
                # NOTE(review): takes the first remaining row; assumes
                # tools.blast_sequences returns hits best-first -- confirm
                x = bl.iloc[0]
                ident = x.pident
            else:
                ident = 0
            data[seq.id] = ident
        data['id'] = acc
        data['strain'] = strain
        data['species'] = species
        data['name'] = full
        result.append(data)

    result = pd.DataFrame(result)
    return result

# Query regions to score in every assembly, then persist the hit table.
seqs = [pknh1,tbd2,pro1,pknh2_sensor]
result = run(seqs)
result.to_csv('rd900_region_hits.csv',index=False)

In [None]:
# Reload the saved hit table so this cell does not depend on re-running the blast loop.
result = pd.read_csv('rd900_region_hits.csv')
# Keep only the per-query identity columns, labelled by (species, strain).
X = result.drop(columns=['name','id']).set_index(['species','strain'])
# Cluster the assemblies by identity profile; yticklabels=1 labels every row.
sns.clustermap(
    X,
    cmap='gray_r',
    yticklabels=1,
    figsize=(5,20),
)

In [79]:
def check_folder(names=None, folder='assemblies'):
    """Print downloaded assembly files that are not in the list of wanted accessions.

    Args:
        names: iterable of accession strings to keep (e.g. 'GCA_000934325.3').
            Defaults to the `Assembly` column of the module-level `filtered`.
        folder: directory that holds the '<accession>.fa.gz' files.

    Returns:
        None; the path of every stray file is printed.
    """
    if names is None:
        names = filtered.Assembly
    # Build a set of the values: `x in series` on a pandas Series tests the
    # index labels, not the values, so the original check never matched.
    wanted = set(names)
    suffix = '.fa.gz'
    for f in glob.glob(os.path.join(folder, '*'+suffix)):
        # Strip only the file suffix so the accession keeps its version
        # ('GCA_000934325.3'); splitting on the first '.' dropped it.
        acc = os.path.basename(f)[:-len(suffix)]
        if acc not in wanted:
            #os.remove(f)
            print (f)
    return

#check_folder()

### Species tree with ANIs

command is `average_nucleotide_identity.py -o ANIm_out -i assemblies -m ANIm -f`

In [83]:
# Pairwise percentage-identity table written by the ANIm run into ANIm_out/;
# the first column of the tab-separated file becomes the row index.
anim = pd.read_csv(
    'ANIm_out/ANIm_percentage_identity.tab',
    index_col=0,
    sep='\t',
)

In [69]:
# Relabel the rows of the ANIm matrix with species names.
# Build the accession -> species lookup from every filtered assembly; a
# lookup made from only the first 50 rows turned all other labels into NaN.
df = filtered.set_index('Assembly').species
mapping = df.to_dict()
# Fall back to the existing label when it is not a known accession. This keeps
# unmatched rows identifiable and makes the cell safe to re-run (labels that
# were already replaced by a species name are left untouched).
# NOTE(review): assumes the ANIm row labels are the assembly accessions --
# confirm against the file names that were given to the ANI run.
anim.index = anim.index.map(lambda label: mapping.get(label, label))
anim.to_csv('anim_matrix.csv')

In [None]:
# Cluster the ANIm identity matrix; xticklabels=1 labels every column.
sns.clustermap(
    anim,
    cmap='coolwarm',
    xticklabels=1,
    figsize=(10,10),
)

### Identify lineages using RD regions