# Summary:

This notebook is for visualizing contig annotations from PROKKA

    
# Example Use Case:

In this example, the complete Shakya et al. 2013 metagenome is being compared to small, medium, and large subsamples of itself after conservative or aggressive read filtering and assembly with SPAdes or MEGAHIT. The datasets used in this example are named according to their metagenome content, relative degree of read filtering, and assembler used:

* SRR606249 = Accession number for the complete Shakya et al. 2013 metagenome
* subset50 = 50% of the complete Shakya et al. 2013 metagenome
* subset25 = 25% of the complete Shakya et al. 2013 metagenome
* subset10 = 10% of the complete Shakya et al. 2013 metagenome
* pe.trim2 = Conservative read filtering
* pe.trim30 = Aggressive read filtering
* megahit = MEGHIT assembly 
* spades = SPAdes assembly 


# Objectives:

* Annotation 
* Total number of genes
* Total number unique
* Compare samples and spit out unique

In [1]:
#Import pyupset and dependencies 
#import pyupset as pyu (Not sure that we want to use pyupset)
import matplotlib as mpl
import matplotlib.pyplot as plt
from pickle import load
import pandas as pd
import glob
%matplotlib inline

In [63]:
def concat_files(filenames):
    x = glob.glob(filenames)
    list_of_dfs = [pd.read_table(filename) for filename in x]
    for dataframe, filename in zip(list_of_dfs, x):
        dataframe['filename'] = filename
    combined_df = pd.concat(list_of_dfs, ignore_index=True)
    return combined_df
concat_files("*tsv")


Unnamed: 0,locus_tag,ftype,gene,EC_number,product,filename
0,OBGPGDIN_00001,CDS,drrB_1,Daunorubicin/doxorubicin resistance ABC transp...,,SRR606249_subset25_1.trim30_megahit_.tsv
1,OBGPGDIN_00002,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
2,OBGPGDIN_00003,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
3,OBGPGDIN_00004,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
4,OBGPGDIN_00005,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
5,OBGPGDIN_00006,CDS,valS_1,6.1.1.9,Valine--tRNA ligase,SRR606249_subset25_1.trim30_megahit_.tsv
6,OBGPGDIN_00007,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
7,OBGPGDIN_00008,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
8,OBGPGDIN_00009,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv
9,OBGPGDIN_00010,CDS,hypothetical protein,,,SRR606249_subset25_1.trim30_megahit_.tsv


In [64]:
# Calculate the total number of genes annotated with Prokka
def calc_total_genes():
    combined_df = concat_files("*tsv")
    x = combined_df.groupby('filename').gene.count()
    y = x.to_frame()
    bingo = y.sort_values('gene',ascending=False)
    bingo
    return bingo
calc_total_genes()

Unnamed: 0_level_0,gene
filename,Unnamed: 1_level_1
SRR606249_1.trim2_megahit_.tsv,195733
SRR606249_1.trim30_megahit_.tsv,195340
SRR606249_subset50_1.trim2_megahit_.tsv,193931
SRR606249_1.trim2_spades_.tsv,192008
SRR606249_subset50_1.trim30_megahit_.tsv,191005
SRR606249_1.trim30_spades_.tsv,190777
SRR606249_subset50_1.trim30_spades_.tsv,184330
SRR606249_subset25_1.trim2_megahit_.tsv,182300
SRR606249_subset25_1.trim2_spades_.tsv,177824
SRR606249_subset25_1.trim30_megahit_.tsv,172574


In [65]:
# Calculate the total number of unique genes annotated with Prokka

def calculate_unique_genes():
    combined_df = concat_files("*tsv")
    x = combined_df.groupby('filename').gene.nunique()
    y = x.to_frame()
    bingo = y.sort_values('gene',ascending=False)
    bingo
    return bingo
calculate_unique_genes()

Unnamed: 0_level_0,gene
filename,Unnamed: 1_level_1
SRR606249_1.trim2_megahit_.tsv,94112
SRR606249_1.trim2_spades_.tsv,93457
SRR606249_1.trim30_megahit_.tsv,91514
SRR606249_1.trim30_spades_.tsv,90661
SRR606249_subset50_1.trim2_megahit_.tsv,89970
SRR606249_subset50_1.trim30_megahit_.tsv,85913
SRR606249_subset50_1.trim30_spades_.tsv,85258
SRR606249_subset25_1.trim2_megahit_.tsv,77010
SRR606249_subset25_1.trim2_spades_.tsv,76953
SRR606249_subset25_1.trim30_spades_.tsv,71905


In [66]:
# Calcuate the intersection between the unique genes in each dataset
combined_df = concat_files('*tsv')
combined_df.dropna(axis=0, inplace=True)
#combined_df.head()
g = combined_df.groupby('gene')
ug = list(set(combined_df['gene']))

In [67]:
g.get_group(ug[0])

Unnamed: 0,locus_tag,ftype,gene,EC_number,product,filename
167719,OBGPGDIN_167303,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_subset25_1.trim30_megahit_.tsv
286812,GJENEIJK_113641,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_subset25_1.trim2_spades_.tsv
454630,EDLHBDBE_103283,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_subset25_1.trim30_spades_.tsv
676484,AFCEBHFN_157674,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_subset25_1.trim2_megahit_.tsv
806312,EFEGHJOA_105021,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_1.trim2_spades_.tsv
1039909,DBPCJIAG_146137,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_1.trim30_megahit_.tsv
1188986,EKJEKHNL_99637,CDS,sucD_51,6.2.1.4,Succinate--CoA ligase [GDP-forming] subunit alpha,SRR606249_subset50_1.trim30_spades_.tsv
1402572,POHKDBJL_128404,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_subset50_1.trim2_megahit_.tsv
1565571,MCMCFPFJ_97208,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_1.trim30_spades_.tsv
1807780,MOCBOGLA_148137,CDS,sucD_51,6.2.1.5,Succinate--CoA ligase [ADP-forming] subunit alpha,SRR606249_1.trim2_megahit_.tsv


In [68]:
for gene in ug[0:10]:
    gene_group = g.get_group(gene)
    if len(gene_group['filename'])>1:
        filename = "genes/%s.csv"%(gene)
        gene_group[['filename', 'gene']].to_csv()

In [86]:
# Creat concatenated tsv file 
combined_df = concat_files('*tsv')
# Remove columns keeping only 'gene' and 'filename'
new_combined_df = combined_df.drop(df.columns[[0, 1, 3, 4]], axis=1)
# Drop any na values
new_combined_df.dropna(axis=0, inplace=True)
new_combined_df.head()
g = combined_df.groupby('gene')
ug = list(set(new_combined_df['gene']))

a = []
for gene in ug:
    gene_group = g.get_group(gene)
    if len(gene_group['filename'])>1:
        a.append(gene_group[['filename', 'gene']])


list

In [None]:
a = {}

for gene in gene_list:
    a[gene] = []



In [117]:
from collections import defaultdict

gene_filenames = defaultdict(list)

for line in a:
    gene_filenames[line['gene'].iloc[0]].extend(line['filename'].tolist())

In [121]:
gene_filenames

defaultdict(list,
            {'sucD_51': ['SRR606249_subset25_1.trim30_megahit_.tsv',
              'SRR606249_subset25_1.trim2_spades_.tsv',
              'SRR606249_subset25_1.trim30_spades_.tsv',
              'SRR606249_subset25_1.trim2_megahit_.tsv',
              'SRR606249_1.trim2_spades_.tsv',
              'SRR606249_1.trim30_megahit_.tsv',
              'SRR606249_subset50_1.trim30_spades_.tsv',
              'SRR606249_subset50_1.trim2_megahit_.tsv',
              'SRR606249_1.trim30_spades_.tsv',
              'SRR606249_1.trim2_megahit_.tsv',
              'SRR606249_subset50_1.trim30_megahit_.tsv'],
             'rbsA_53': ['SRR606249_1.trim2_spades_.tsv',
              'SRR606249_1.trim30_megahit_.tsv',
              'SRR606249_subset50_1.trim2_megahit_.tsv',
              'SRR606249_1.trim30_spades_.tsv',
              'SRR606249_1.trim2_megahit_.tsv',
              'SRR606249_subset50_1.trim30_megahit_.tsv'],
             'mobB_25': ['SRR606249_1.trim2_spades_.tsv',
 

In [122]:
filenames = set()
for files in gene_filenames.values():
    filenames.update(files)

In [131]:
filenames = list(filenames)

In [132]:
data = {}
for gene, files in gene_filenames.items():
    data[gene] = [file in files for file in filenames]
dense_df = pd.DataFrame.from_dict(data, orient='index', columns=filenames)
dense_df

Unnamed: 0,SRR606249_subset10_1.trim2_spades_.tsv,SRR606249_1.trim2_spades_.tsv,SRR606249_subset25_1.trim30_megahit_.tsv,SRR606249_subset50_1.trim2_megahit_.tsv,SRR606249_1.trim30_spades_.tsv,SRR606249_1.trim2_megahit_.tsv,SRR606249_subset25_1.trim30_spades_.tsv,SRR606249_1.trim30_megahit_.tsv,SRR606249_subset50_1.trim30_spades_.tsv,SRR606249_subset50_1.trim30_megahit_.tsv,SRR606249_subset10_1.trim30_spades_.tsv,SRR606249_subset10_1.trim30_megahit_.tsv,SRR606249_subset25_1.trim2_spades_.tsv,SRR606249_subset10_1.trim2_megahit_.tsv,SRR606249_subset25_1.trim2_megahit_.tsv
sucD_51,False,True,True,True,True,True,True,True,True,True,False,False,True,False,True
rbsA_53,False,True,False,True,True,True,False,True,False,True,False,False,False,False,False
mobB_25,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False
groL_57,False,True,False,True,True,True,False,True,True,True,False,False,False,False,True
bamA_8,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
rpmC_20,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
lipB_18,False,True,True,True,True,True,True,True,True,True,False,False,True,False,True
arcC1_3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
tpx_16,False,True,True,True,True,True,True,True,True,True,False,False,True,False,True
dinB_39,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False


In [127]:
pd.DataFrame.from_dict?

In [60]:
# Import glob and create a dictionary of dataframes with name 'metaG*csv' with ',' delimiter. Split the file names by 
# '_' to generate unique file names for output. 
import glob 

genus_dict={}
for file in glob.glob('*tsv'):
    df=pd.read_table(file, delimiter = "\t")
    #a = df.drop(df.columns[[0, 1]], axis=1)
    #x=file.split('tsv')[0]
    #print(x)
    #genus_dict[x]=df
df

Unnamed: 0,locus_tag,ftype,gene,EC_number,product
0,DIGCPLMF_00001,CDS,hypothetical protein,,
1,DIGCPLMF_00002,CDS,araQ_1,L-arabinose transport system permease protein ...,
2,DIGCPLMF_00003,CDS,hypothetical protein,,
3,DIGCPLMF_00004,CDS,hypothetical protein,,
4,DIGCPLMF_00005,CDS,lplJ_1,6.3.1.20,Lipoate-protein ligase LplJ
5,DIGCPLMF_00006,CDS,hypothetical protein,,
6,DIGCPLMF_00007,CDS,hypothetical protein,,
7,DIGCPLMF_00008,CDS,rpoC_1,2.7.7.6,DNA-directed RNA polymerase subunit beta'
8,DIGCPLMF_00009,CDS,spoIIM,Stage II sporulation protein M,
9,DIGCPLMF_00010,CDS,hypothetical protein,,


In [50]:
import pyupset as pyu

In [52]:
# Generate upset plot of the intersection of between data contained in column labeled 'gene'. 
pplot=pyu.plot(genus_dict, unique_keys = ['gene'], inters_size_bounds=(700, 100000))
pplot.set_size_inches(18.5, 10.5)
#pplot['figure'].savefig('meta_annotation_comparison.png').savefig('kaiju-smash-podar.pdf', dpi=100000000)

KeyboardInterrupt: 

Import image in image manipulation software like pixlr to clean up colors