In [11]:
# imports for ctgene
import pandas as pd
import numpy as np

# imports for plotting
import toyplot
import toyplot.png

In [12]:
# ctgene function
def ctgene(df, genelist):
    """Count, per 'Shortname', how many rows of ``df`` mention each gene.

    A row is a hit for a gene when the gene occurs (case-insensitively) as a
    substring of the row's 'Gene Name' AND the gene immediately followed by a
    digit does not occur anywhere in that name. The second condition keeps
    e.g. 'L1' from being counted for a row that names 'L10'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Gene Name' and 'Shortname' columns. Rows whose
        'Gene Name' is not a string (e.g. missing values) match no gene.
    genelist : iterable of str
        Gene names to look for. A gene listed more than once is counted once.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Shortname', in order of first appearance in
        ``df`` (so the result is reproducible between runs). The first column
        is 'Shortname'; it is followed by one count column per distinct gene,
        in ``genelist`` order.
    """
    # De-duplicate while keeping order: iterating a raw list with repeats
    # would increment the same counter several times for a single row.
    genes = list(dict.fromkeys(genelist))
    # Order of first appearance instead of set order, which varies per process.
    shortnames = list(dict.fromkeys(df['Shortname']))

    counts = {gene: dict.fromkeys(shortnames, 0) for gene in genes}

    # Loop-invariant search strings, built once instead of once per row:
    # (gene, lower-cased gene, lower-cased gene + each digit).
    digits = '0123456789'
    needles = [
        (gene, gene.lower(), [gene.lower() + digit for digit in digits])
        for gene in genes
    ]

    for gene_name, shortname in zip(df['Gene Name'], df['Shortname']):
        if not isinstance(gene_name, str):
            # Missing / non-text gene name: nothing to match against.
            continue
        lowered = gene_name.lower()
        for gene, needle, longer_variants in needles:
            if needle not in lowered:
                continue
            # Reject when a longer-numbered gene (needle + digit) is present.
            if any(variant in lowered for variant in longer_variants):
                continue
            counts[gene][shortname] += 1

    # Wide layout: easy to read, but not ideal for plotting.
    countdf = pd.DataFrame(counts, index=shortnames)
    return countdf.rename_axis('Shortname').reset_index()

In [13]:
def get_MAGids(df):
    """Extract the MAG identifier from every entry of ``df['Genome Name']``.

    Each name is split on the literal 'SB-' and the piece right after the
    first occurrence is kept (up to the next 'SB-', if there is one).

    Returns a plain list with one string per row, in row order. A name that
    does not contain 'SB-' raises IndexError.
    """
    mag_ids = []
    for genome_name in df['Genome Name']:
        pieces = genome_name.split('SB-')
        mag_ids.append(pieces[1])
    return mag_ids

# renames columns in output dfs from JGI IMG so that they match
# adds MAG Id column ('MAG ##')


def prep_IMG_blast(taxondf, blastdf):
    """Align a taxon table with a BLAST table and label every BLAST row.

    Steps:
      * rename the taxon table's 'Genome Name / Sample Name' and
        'IMG Genome ID ' columns to 'Genome Name' and 'Genome Id';
      * add a 'MAG Id' column (from ``get_MAGids``) to both tables;
      * add a 'Shortname' column to the BLAST table, built as
        ``"<Class> <MAG Id>"`` where Class is looked up in the taxon table
        through 'Genome Id' (the first taxon row wins if an id is repeated).

    Parameters
    ----------
    taxondf : pandas.DataFrame
        Taxon table; needs the two columns renamed above plus 'Class'.
    blastdf : pandas.DataFrame
        BLAST table; needs 'Gene Name', 'Genome Name' and 'Genome Id'.
        It may carry any index. Neither input frame is modified.

    Returns
    -------
    tuple
        ``(taxondf, blastdf, neatdf)``: the renamed/extended taxon table, the
        extended copy of the BLAST table, and a copy of the BLAST table
        restricted to 'Gene Name', 'Genome Name', 'Genome Id', 'MAG Id' and
        'Shortname'.

    Raises
    ------
    IndexError
        If a BLAST row's 'Genome Id' is absent from the taxon table.
    """
    taxondf = taxondf.rename(columns={
        'Genome Name / Sample Name': 'Genome Name',
        'IMG Genome ID ': 'Genome Id',
    })
    taxondf['MAG Id'] = get_MAGids(taxondf)

    # Work on a copy so the caller's BLAST table is not altered in place.
    blastdf = blastdf.copy()
    blastdf['MAG Id'] = get_MAGids(blastdf)

    # Build the Genome Id -> Class lookup once instead of filtering the whole
    # taxon table for every BLAST row. setdefault keeps the first match.
    class_by_genome = {}
    for genome_id, mag_class in zip(taxondf['Genome Id'], taxondf['Class']):
        class_by_genome.setdefault(genome_id, mag_class)

    # Collect labels in row order rather than writing to a list slot chosen by
    # the index label, which is only a valid position for a default RangeIndex.
    shortnames = []
    for genome_id, mag_id in zip(blastdf['Genome Id'], blastdf['MAG Id']):
        if genome_id not in class_by_genome:
            raise IndexError(
                "Genome Id {!r} from the BLAST table is not present in the "
                "taxon table".format(genome_id)
            )
        shortnames.append(class_by_genome[genome_id] + ' ' + mag_id)

    blastdf['Shortname'] = shortnames

    neatdf = blastdf[
        ['Gene Name', 'Genome Name', 'Genome Id', 'MAG Id', 'Shortname']
    ].copy()

    return (taxondf, blastdf, neatdf)
# Note: the taxon table's 'Genome Name / Sample Name' column is renamed to
# 'Genome Name' so that it matches the BLAST table's 'Genome Name' column.

In [9]:
# Read the BLAST and taxon CSV tables, then hand both to prep_IMG_blast, which
# returns the prepared taxon table, the extended BLAST table and a trimmed
# table (`df`) holding only the columns needed downstream.
blastdf = pd.read_csv(
    'Data/bacteroidetes-MAGs-blast-bfragilis-conserved-riboprots.csv'
)
taxondf = pd.read_csv(
    'Data/bacteroidetes-MAGs-taxontable.csv'
)
(taxondf, blastdf, df) = prep_IMG_blast(taxondf, blastdf)

In [10]:
# The gene list CSV is read without a header row; column 0 holds the gene names.
genelist = (
    pd.read_csv('Data/conserved-riboprots-list.csv', header=None)[0]
    .tolist()
)

In [11]:
# Per-Shortname gene counts, first as a DataFrame and then as a plain array
# whose column 0 is the 'Shortname' label and whose other columns are counts.
countsdf = ctgene(df, genelist)
counts = countsdf.to_numpy()

# Count columns only (everything after the label column), as a matrix.
countsmatrix = np.asmatrix(counts[:, 1:])

# Labels for the plot table; the leading '' fills the top-left corner cell.
rowlabels = np.append([''], counts[:, 0])
collabels = np.append([''], list(countsdf.columns[1:]))

In [403]:
# Draw the counts as a presence/absence grid on a white canvas.
canvas = toyplot.Canvas(
    width=1500,
    height=600,
    style={'background-color': 'white'},
)

# One table row per row of `counts` plus a header row; column 0 is kept for
# the row labels, so the column count equals that of `counts`.
table = canvas.table(
    rows=len(counts) + 1,
    columns=counts.shape[1],
    margin=0,
    bounds=(20, 1480, 20, 580),
)

# Shade the data cells: gray for exactly one hit, black for more than one;
# cells with any other value keep their default style.
for ridx, count_row in enumerate(counts):
    for cidx in range(1, len(count_row)):
        cell = table.cells.cell[ridx + 1, cidx]
        hits = count_row[cidx]
        if hits > 1:
            cell.style = {'fill': toyplot.color.css("black")}
        elif hits == 1:
            cell.style = {"fill": toyplot.color.css("gray")}

# Spacing between the cells.
table.body.gaps.columns[:] = 3
table.body.gaps.rows[:] = 3

# Label column on the left (right-aligned, fixed width) and header row on top.
table.cells.column[0].data = rowlabels
table.cells.column[0].width = 140
table.cells.column[0].align = 'right'
table.cells.row[0].data = collabels

In [404]:
# Render `canvas` to a PNG file; the relative path resolves against the
# current working directory.
toyplot.png.render(canvas, 'riboprots-presence-absence.png')