In [15]:
import os
import pandas as pd
import numpy as np
import statistics

In [3]:
metadata = pd.read_csv('/home/ecutts/metadata/bacteroidetes-img-metadata.csv')
metadata.rename(columns={'Genome Size   * assembled': 'Genome Size', 'Gene Count   * assembled': 'Gene Count'}, inplace=True)
os.chdir('/home/ecutts/Bacteroidetes/PULs_MAGs/')
metadata.head()

Unnamed: 0,taxon_oid,Domain,Sequencing Status,Study Name,Genome Name / Sample Name,Sequencing Center,IMG Genome ID,Phylum,Class,Order,Family,Genus,Species,Genome Size,Gene Count
0,2806310603,Bacteria,Permanent Draft,Coastal marine microbial mats from various loc...,Phaeodactylibacter sp. SB-MAG 46,Massachusetts Institute of Technology,2806310603,Bacteroidetes,Saprospiria,Saprospirales,Haliscomenobacteraceae,Phaeodactylibacter,unclassified,6984516,5703
1,2806310602,Bacteria,Permanent Draft,Coastal marine microbial mats from various loc...,Cytophagales bacterium SB-MAG 45,Massachusetts Institute of Technology,2806310602,Bacteroidetes,Cytophagia,Cytophagales,unclassified,unclassified,unclassified,4154663,3798
2,2806310586,Bacteria,Permanent Draft,Coastal marine microbial mats from various loc...,Muricauda sp. SB-MAG 15,Massachusetts Institute of Technology,2806310586,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Muricauda,unclassified,3261612,3113
3,2806310581,Bacteria,Permanent Draft,Coastal marine microbial mats from various loc...,Cyclobacteriaceae bacterium SB-MAG 6,Massachusetts Institute of Technology,2806310581,Bacteroidetes,Cytophagia,Cytophagales,Cyclobacteriaceae,unclassified,unclassified,5868605,4784
4,2806310580,Bacteria,Permanent Draft,Coastal marine microbial mats from various loc...,Fulvivirga sp. SB-MAG 5,Massachusetts Institute of Technology,2806310580,Bacteroidetes,Cytophagia,Cytophagales,Fulvivirgaceae,Fulvivirga,unclassified,4204804,3892


In [4]:
pul_patterns = dict.fromkeys(os.listdir())
puls = dict.fromkeys(os.listdir())

pul_patterns_head = ['genome', 'pulid', 'contigid', 'start', 'end', 'pattern']
puls_head = ['genome', 'pulid', 'protein_id', 'contig', 'start', 'end', 'strand',
       'dist', 'protein_name', 'sus', 'hmm', 'active']

In [5]:
for mag in os.listdir():
    try:
        pul_patterns[mag] = pd.read_table(os.path.join(mag, mag + '.puls.sum.tsv'))
    except:
        pul_patterns[mag] = pd.DataFrame(columns=pul_patterns_head)
        print('No PUL in ' + mag)
    try:
        puls[mag] = pd.read_table(os.path.join(mag, mag + '.puls.tsv'))
    except:
        puls[mag] = pd.DataFrame(columns=puls_head)

No PUL in 2806310609
No PUL in 2806310621
No PUL in 2808607019


# PUL summary
Calculate the number of PUL in each MAG

In [6]:
ids = puls.keys()
MAGs = [metadata[metadata['taxon_oid'] == int(mag)]['Genome Name / Sample Name'].tolist()[0].split('-')[1] for mag in ids]
classes = [metadata[metadata['taxon_oid'] == int(mag)]['Class'].tolist()[0] for mag in ids]
genome_length = [metadata[metadata['taxon_oid'] == int(mag)]['Genome Size'].tolist()[0] for mag in ids]
counts = [len(pul_patterns[mag]) for mag in ids]
pul_genes = [len(puls[mag])for mag in ids]

# make dataframe to contain summary with basic info pre-populated
summary = pd.DataFrame({'id': ids, 'MAG' : MAGs, 'class': classes, 'assembly length': genome_length, 'PUL ct': counts, 'PUL genes': pul_genes})

# calculate bp in PUL
pul_bp = []
for mag in ids:
    df = pul_patterns[mag]
    bp = 0
    for index, row in df.iterrows():
        bp += abs(row['end'] - row['start'])
    pul_bp.append(bp)
    
summary['PUL bp'] = pul_bp

# calculate # loner susCD
loner_ct = []
for mag in ids:
    df = pul_patterns[mag]
    ct = 0
    for index, row in df.iterrows():
        if row['pattern'] == 'susD-susC' or row['pattern'] == 'susC-susD':
            ct += 1
    loner_ct.append(ct)
summary['lone susCD'] = loner_ct
    
# get percent of assembly in PUL
summary['% assembly in PUL'] = summary['PUL bp'] / genome_length * 100

# calculate percent genes in PUL
genes = [metadata[metadata['taxon_oid'] == int(mag)]['Gene Count'].tolist()[0] for mag in ids]
summary['% genes in PUL'] = summary['PUL ct'] / genes * 100

# calculate percent PUL =  loner susCD
summary['% PUL lone susCD'] = loner_ct / summary['PUL ct'] * 100 

summary

Unnamed: 0,id,MAG,class,assembly length,PUL ct,PUL genes,PUL bp,lone susCD,% assembly in PUL,% genes in PUL,% PUL lone susCD
0,2806310567,MAG 4,Cytophagia,5173205,8,42,77045,4,1.489309,0.17567,50.0
1,2806310580,MAG 5,Cytophagia,4204804,6,39,69917,1,1.662789,0.154162,16.666667
2,2806310581,MAG 6,Cytophagia,5868605,60,281,575954,26,9.814155,1.254181,43.333333
3,2806310583,MAG 11,Saprospiria,6182174,15,49,106869,9,1.728664,0.304075,60.0
4,2806310586,MAG 15,Flavobacteriia,3261612,6,33,52442,3,1.607855,0.19274,50.0
5,2806310602,MAG 45,Cytophagia,4154663,21,81,162094,13,3.901496,0.552923,61.904762
6,2806310603,MAG 46,Saprospiria,6984516,11,56,102525,7,1.46789,0.192881,63.636364
7,2806310609,MAG 55,Saprospiria,5506875,0,0,0,0,0.0,0.0,
8,2806310610,MAG 56,Cytophagia,5856607,29,76,142237,22,2.428659,0.50655,75.862069
9,2806310613,MAG 62,Saprospiria,4246127,5,11,23534,4,0.554246,0.121036,80.0


In [7]:
# Display summary from most PUL to least
summary.sort_values(by='PUL ct', ascending = False)

Unnamed: 0,id,MAG,class,assembly length,PUL ct,PUL genes,PUL bp,lone susCD,% assembly in PUL,% genes in PUL,% PUL lone susCD
2,2806310581,MAG 6,Cytophagia,5868605,60,281,575954,26,9.814155,1.254181,43.333333
16,2808607015,MAG 14,Bacteroidia,6192519,35,112,238889,23,3.857703,0.675676,65.714286
8,2806310610,MAG 56,Cytophagia,5856607,29,76,142237,22,2.428659,0.50655,75.862069
5,2806310602,MAG 45,Cytophagia,4154663,21,81,162094,13,3.901496,0.552923,61.904762
3,2806310583,MAG 11,Saprospiria,6182174,15,49,106869,9,1.728664,0.304075,60.0
18,2806310565,MAG 1,Cytophagia,3952841,11,44,88019,7,2.226728,0.318841,63.636364
6,2806310603,MAG 46,Saprospiria,6984516,11,56,102525,7,1.46789,0.192881,63.636364
0,2806310567,MAG 4,Cytophagia,5173205,8,42,77045,4,1.489309,0.17567,50.0
19,2806310566,MAG 2,Cytophagia,4036174,7,30,56568,4,1.401525,0.199886,57.142857
1,2806310580,MAG 5,Cytophagia,4204804,6,39,69917,1,1.662789,0.154162,16.666667


In [22]:
def mmm(col):
    mean = str(np.mean(col))
    med = str(statistics.median(col))
    return mean, med


print(mmm(summary['PUL ct']))
print(mmm(summary['lone susCD']))
print(mmm(summary['% PUL lone susCD']))
print(mmm(summary['% assembly in PUL']))

('11.25', '6.0')
('6.6', '4.0')
('64.97431574510681', '64.67532467532467')
('1.6931666762815145', '1.4347075676708596')


In [None]:
# genome size is calculated in number of BASE PAIRS. So % of genome coding for PULs is... 
# how to calculate? talk to Tanja about this. 