# Summary:

This notebook is for visualizing antibiotic resistance gene tables generated by [ABRicate](https://github.com/tseemann/abricate) and [SRST2](https://github.com/katholt/srst2). 

    
# Example Use Case:

In this example, the complete Shakya et al. 2013 metagenome is being compared to small, medium, and large subsamples of itself after conservative or aggressive read filtering and assembly with SPAdes or MEGAHIT. The datasets used in this example are named according to their metagenome content, relative degree of read filtering, and assembler used where appropriate. ABRicate is appropriate for analysis of antibiotic resistance genes (ARG) in reads while is SRST is useful for analysis of ABR in contigs.

* SRR606249 = Accession number for the complete Shakya et al. 2013 metagenome
* subset50 = 50% of the complete Shakya et al. 2013 metagenome
* subset25 = 25% of the complete Shakya et al. 2013 metagenome
* subset10 = 10% of the complete Shakya et al. 2013 metagenome
* pe.trim2 = Conservative read filtering
* pe.trim30 = Aggressive read filtering
* megahit = MEGHIT assembly 
* spades = SPAdes assembly 


# Objectives:

* Create table with all of the genes found 
* Count the total number of genes found for each dataset
* Count the number of unique genes found per dataset
* Compare unique genes found using a presence/absence table
* Compare results from reads and assemblies

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
#from pickle import load
import pandas as pd
import glob
#%matplotlib inline

In [2]:
def concat_files(filenames):
    x = glob.glob(filenames)
    list_of_dfs = [pd.read_table(filename, header = 1) for filename in x]
    for dataframe, filename in zip(list_of_dfs, x):
        dataframe['filename'] = filename
    combined_df = pd.concat(list_of_dfs, ignore_index=True)
    return combined_df
concat_files("*tab")

Unnamed: 0,#FILE,SEQUENCE,START,END,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,filename
0,Processing: /data/SRR606249_1.trim30.fq.gz_spa...,,,,,,,,,,,,SRR606249_1.trim30_spades_abricate.tab
1,Found 15 genes in /data/SRR606249_1.trim30.fq....,,,,,,,,,,,,SRR606249_1.trim30_spades_abricate.tab
2,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_1077_length_38551_cov_20.554681,14537.0,14715.0,vat(A)_1,346-524/660,.......=/====..,2/2,26.97,75.556,resfinder,L07778,SRR606249_1.trim30_spades_abricate.tab
3,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_1288_length_30418_cov_6.642328,13224.0,13332.0,oqxB_1,1168-1276/3153,.....==........,0/0,3.46,83.486,resfinder,EU370913,SRR606249_1.trim30_spades_abricate.tab
4,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_14_length_460229_cov_14.415215,103641.0,105137.0,lsa(A)_2,1-1497/1497,===============,0/0,100.00,99.933,resfinder,AY58982,SRR606249_1.trim30_spades_abricate.tab
5,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_16351_length_1140_cov_6.075576,1.0,581.0,blaOXA-181_1,123-703/798,..============.,0/0,72.81,81.928,resfinder,HM992946,SRR606249_1.trim30_spades_abricate.tab
6,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_16931_length_1080_cov_2.175610,904.0,1025.0,aph(6)-Ic_1,454-575/801,........===....,0/0,15.23,78.689,resfinder,X01702,SRR606249_1.trim30_spades_abricate.tab
7,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_26607_length_447_cov_6.528061,35.0,447.0,oqxB_1,2659-3071/3153,......../...===,4/4,13.04,75.422,resfinder,EU370913,SRR606249_1.trim30_spades_abricate.tab
8,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_27739_length_407_cov_3.707386,15.0,137.0,vat(F)_1,99-221/666,..===..........,0/0,18.47,77.236,resfinder,AF170730,SRR606249_1.trim30_spades_abricate.tab
9,/data/SRR606249_1.trim30.fq.gz_spades_output/c...,NODE_28254_length_391_cov_2.407738,1.0,169.0,msr(D)_2,1296-1464/1464,.............==,0/0,11.54,100.000,resfinder,AF274302,SRR606249_1.trim30_spades_abricate.tab


In [3]:
# Calculate the total number of genes annotated with Prokka
def calc_total_genes():
    combined_df = concat_files('*tab')
    x = combined_df.groupby('filename').GENE.count()
    y = x.to_frame()
    bingo = y.sort_values('GENE',ascending=False)
    bingo
    return bingo
calc_total_genes()

Unnamed: 0_level_0,GENE
filename,Unnamed: 1_level_1
SRR606249_1.trim30_spades_abricate.tab,15
SRR606249_subset10_1.trim2_spades_abricate.tab,15
SRR606249_subset25_1.trim2_spades_abricate.tab,15
SRR606249_1.trim2_spades_abricate.tab,14
SRR606249_subset25_1.trim30_megahit_abricate.tab,14
SRR606249_subset25_1.trim30_spades_abricate.tab,14
SRR606249_1.trim30_megahit_abricate.tab,13
SRR606249_subset25_1.trim2_megahit_abricate.tab,13
SRR606249_subset10_1.trim30_spades_abricate.tab,12
SRR606249_subset50_1.trim2_spades_abricate.tab,12


In [4]:
# Calculate the total number of unique genes annotated with Prokka

def calculate_unique_genes():
    combined_df = concat_files("*tab")
    x = combined_df.groupby('filename').GENE.nunique()
    y = x.to_frame()
    bingo = y.sort_values('GENE',ascending=False)
    bingo
    return bingo
calculate_unique_genes()

Unnamed: 0_level_0,GENE
filename,Unnamed: 1_level_1
SRR606249_1.trim2_spades_abricate.tab,12
SRR606249_1.trim30_spades_abricate.tab,12
SRR606249_subset25_1.trim30_spades_abricate.tab,12
SRR606249_subset25_1.trim2_spades_abricate.tab,11
SRR606249_subset25_1.trim30_megahit_abricate.tab,11
SRR606249_1.trim30_megahit_abricate.tab,10
SRR606249_subset25_1.trim2_megahit_abricate.tab,10
SRR606249_subset50_1.trim2_spades_abricate.tab,10
SRR606249_1.trim2_megahit_abricate.tab,9
SRR606249_subset50_1.trim30_spades_abricate.tab,9


In [5]:
# Calcuate the intersection between the unique genes in each dataset
combined_df = concat_files('*tab')
combined_df.dropna(axis=0, inplace=True)
#combined_df.head()
g = combined_df.groupby('GENE')
ug = list(set(combined_df['GENE']))

In [6]:
# Creat concatenated tsv file 
combined_df = concat_files('*tab')
# Remove columns keeping only 'gene' and 'filename'
#new_combined_df = combined_df.drop(combined_df.columns[[0, 1, 3, 4]], axis=1)
# Drop any na values
combined_df.dropna(axis=0, inplace=True)
#new_combined_df.head()
g = combined_df.groupby('GENE')
ug = list(set(combined_df['GENE']))

a = []
for GENE in ug:
    gene_group = g.get_group(GENE)
    if len(gene_group['filename'])>1:
        a.append(gene_group[['filename', 'GENE']])

In [7]:
from collections import defaultdict

gene_filenames = defaultdict(list)

for line in a:
    gene_filenames[line['GENE'].iloc[0]].extend(line['filename'].tolist())

In [8]:
gene_filenames

defaultdict(list,
            {'tet(O)_3': ['SRR606249_1.trim30_spades_abricate.tab',
              'SRR606249_subset50_1.trim30_spades_abricate.tab',
              'SRR606249_subset50_1.trim2_spades_abricate.tab',
              'SRR606249_1.trim2_spades_abricate.tab'],
             'blaOXA-48_2': ['SRR606249_1.trim30_spades_abricate.tab',
              'SRR606249_subset25_1.trim2_megahit_abricate.tab',
              'SRR606249_1.trim2_megahit_abricate.tab',
              'SRR606249_subset25_1.trim30_megahit_abricate.tab',
              'SRR606249_subset50_1.trim2_spades_abricate.tab',
              'SRR606249_1.trim30_megahit_abricate.tab'],
             'tet(33)_2': ['SRR606249_subset25_1.trim2_megahit_abricate.tab',
              'SRR606249_subset25_1.trim2_spades_abricate.tab',
              'SRR606249_subset25_1.trim30_spades_abricate.tab',
              'SRR606249_subset25_1.trim30_megahit_abricate.tab'],
             'aph(6)-Ic_1': ['SRR606249_1.trim30_spades_abricate.tab',
    

In [9]:
filenames = set()
for files in gene_filenames.values():
    filenames.update(files)

In [10]:
filenames = list(filenames)

In [11]:
data = {}
for gene, files in gene_filenames.items():
    data[gene] = [file in files for file in filenames]
dense_df = pd.DataFrame.from_dict(data, orient='index', columns=filenames)
dense_df

Unnamed: 0,SRR606249_1.trim30_spades_abricate.tab,SRR606249_subset25_1.trim30_megahit_abricate.tab,SRR606249_subset25_1.trim2_spades_abricate.tab,SRR606249_subset10_1.trim2_spades_abricate.tab,SRR606249_1.trim30_megahit_abricate.tab,SRR606249_subset50_1.trim30_megahit_abricate.tab,SRR606249_subset25_1.trim30_spades_abricate.tab,SRR606249_subset10_1.trim2_megahit_abricate.tab,SRR606249_subset25_1.trim2_megahit_abricate.tab,SRR606249_subset10_1.trim30_spades_abricate.tab,SRR606249_subset50_1.trim30_spades_abricate.tab,SRR606249_subset50_1.trim2_spades_abricate.tab,SRR606249_1.trim2_megahit_abricate.tab,SRR606249_subset10_1.trim30_megahit_abricate.tab,SRR606249_subset50_1.trim2_megahit_abricate.tab,SRR606249_1.trim2_spades_abricate.tab
tet(O)_3,True,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True
blaOXA-48_2,True,True,False,False,True,False,False,False,True,False,False,True,True,False,False,False
tet(33)_2,False,True,True,False,False,False,True,False,True,False,False,False,False,False,False,False
aph(6)-Ic_1,True,False,True,False,True,False,False,False,False,False,False,True,True,False,True,True
lsa(A)_2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
msr(D)_2,True,False,True,False,True,False,True,False,False,False,True,True,True,False,False,True
cepA_1,False,True,False,True,False,False,True,True,True,True,False,False,False,True,False,False
car(A)_1,True,True,True,True,True,True,True,False,True,False,True,True,True,False,True,True
vat(B)_1,True,True,True,True,True,True,True,False,True,True,True,True,True,False,True,True
catB7_1,False,True,False,True,False,False,True,True,False,True,False,False,False,True,False,False


In [12]:
import seaborn as sns

In [13]:
#int_dense_df = dense_df.astype(int).to_csv("ant_res.txt",sep='\t')
#int_dense_df

In [14]:
#import requests

#filename = 'ant_res.txt'
#upload_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/'

#r = requests.post(upload_url, files={'file': open(filename, 'rb')})

#link = r.text

NameError: name 'r' is not defined

In [16]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


In [17]:
import numpy as np
import pandas as pd
from clustergrammer_widget import *
np.version.version

'1.14.5'

In [18]:
# initialize network object
net = Network(clustergrammer_widget)
# load dataframe
net.load_df(dense_df)
# cluster using default parameters
net.cluster(enrichrgram=False)
# make the visualization
net.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "tet(O)_3", "ini": 18, "clust": 6, "rank": 4, "rankvar"…

In [19]:
import requests

filename = 'x.txt'
upload_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/'

r = requests.post(upload_url, files={'file': open(filename, 'rb')})

link = r.text

In [20]:
r.text

'http://amp.pharm.mssm.edu/clustergrammer/viz/5b3efbfeb119f01c5a848a30/x.txt'

In [21]:
import qgrid
qgrid 

<module 'qgrid' from '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/qgrid/__init__.py'>

In [22]:
qgrid.show_grid(dense_df)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [None]:
import ipywidgets as widgets

from ipywidgets import interactive

items = ['All']+sorted(dense_df['SRR606249_1.trim30_spades_abricate.tab'].unique().tolist())

def view(x=''):

    if x=='All': return dense_df

    return dense_df[dense_df['SRR606249_1.trim30_spades_abricate.tab']==x]

w = widgets.Select(options=items)

interactive(view, x=w)