# Methylotuvimicrobium buryatense 5GB1 (Lidstrom Lab): iModulonDB Export

In [1]:
from pymodulon.core import IcaData
from pymodulon.io import *
from pymodulon.imodulondb import *
from pymodulon.util import *
import pandas as pd
from os import path

In [3]:
# Load the raw 5G model. The B. subtilis model that was previously loaded here
# lived at ../data/processed_data/bsu.json.gz and is no longer used.
raw_model_path = path.join('..', 'data', '5g_processed_data', '5g_raw.json.gz')
ica_data = load_json_model(raw_model_path)

In [4]:
# Do not truncate cell contents when displaying the report tables.
pd.set_option('display.max_colwidth', None)

# imodulondb_compatibility returns four items, unpacked here in order.
(table_issues,
 tf_issues,
 missing_gene_links,
 missing_dois) = imodulondb_compatibility(ica_data)

# Print each heading followed by the matching part of the report.
report_sections = [
    ('--Table Issues--', table_issues),
    ('--TF Issues--', tf_issues),
    ('--Missing Gene Links--', missing_gene_links.values),
    ('--Missing DOIs--', missing_dois.values),
]
for heading, content in report_sections:
    print(heading)
    display(content)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,organism,"The default, ""New Organism"", will be used."
1,iModulonDB,dataset,"The default, ""New Dataset"", will be used."
2,iModulonDB,strain,"The default, ""Unspecified"", will be used."
3,iModulonDB,publication_name,"The default, ""Unpublished Study"", will be used."
4,iModulonDB,publication_link,The publication name will not be a hyperlink.
5,iModulonDB,gene_link_db,"The default, ""External Database"", will be used."
6,iModulonDB,organism_folder,"The default, ""new_organism"", will be used."
7,iModulonDB,dataset_folder,"The default, ""new_dataset"", will be used."
8,Gene,gene_name,Locus tags (gene_table.index) will be used.
9,Gene,gene_product,Locus tags (gene_table.index) will be used.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene


--Missing Gene Links--


array(['EQU24_RS00005', 'EQU24_RS00010', 'EQU24_RS00015', ...,
       'EQU24_RS22145', 'EQU24_RS22150', 'EQU24_RS22155'], dtype=object)

--Missing DOIs--


array(['5GB1_FM03_TR1_QC_tpm', '5GB1_FM03_TR2_QC_tpm',
       '5GB1_FM11_TR1_QC_tpm', '5GB1_FM11_TR2_QC_tpm',
       '5GB1_FM12_TR1_tpm', '5GB1_FM12_TR1_QC_tpm', '5GB1_FM12_TR2_tpm',
       '5GB1_FM12_TR2_QC_tpm', '5GB1_FM14_TR1_tpm',
       '5GB1_FM14_TR1_QC_tpm', '5GB1_FM14_TR2_tpm',
       '5GB1_FM14_TR2_QC_tpm', '5GB1_FM18_TR1_QC_tpm',
       '5GB1_FM18_TR2_tpm', '5GB1_FM18_TR2_QC_tpm', '5GB1_FM18_TR3_tpm',
       '5GB1_FM18_TR3_QC_tpm', '5GB1_FM19_TR1_tpm',
       '5GB1_FM19_TR1_QC_tpm', '5GB1_FM19_TR1_UW_tpm',
       '5GB1_FM19_TR3_tpm', '5GB1_FM19_TR3_QC_tpm',
       '5GB1_FM20_TR1_QC_tpm', '5GB1_FM20_TR2_QC_tpm',
       '5GB1_FM20_TR3_tpm', '5GB1_FM20_TR3_QC_tpm',
       '5GB1_FM20_TR3_UW_tpm', '5GB1_FM21_TR1_tpm',
       '5GB1_FM21_TR1_QC_tpm', '5GB1_FM21_TR2_tpm',
       '5GB1_FM21_TR2_QC_tpm', '5GB1_FM21_TR2_UW_tpm',
       '5GB1_FM22_TR1_tpm', '5GB1_FM22_TR1_QC_tpm', '5GB1_FM22_TR3_tpm',
       '5GB1_FM22_TR3_QC_tpm', '5GB1_FM22_TR3_UW_tpm',
       '5GB1_FM23_TR3_tpm', '5GB

## Add the iModulonDB Table

In [5]:
# Dataset-level metadata for iModulonDB. These are the eight keys that the
# compatibility report listed as missing from the iModulonDB table.
# The publication fields are intentionally left as empty strings here.
ica_data.imodulondb_table = dict(
    organism='Methylotuvimicrobium buryatense',
    dataset='Lidstrom Lab',
    strain='5GB1',
    publication_name='',
    publication_link='',
    gene_link_db='BioCyc',
    organism_folder='m_buryatense',
    dataset_folder='lidstrom_lab',
)

## Add the Explained Variance

In [6]:
# Compute the explained variance of every iModulon (one call per index label)
# and store the results in the 'exp_var' column of the iModulon table.
ica_data.imodulon_table['exp_var'] = [
    explained_variance(ica_data, imodulons=imodulon)
    for imodulon in ica_data.imodulon_table.index
]

## Rename some iModulon columns

In [8]:
# Shift the category naming: 'category' -> 'broad_category' and
# 'function' -> 'category'. Columns that are absent are simply left alone.
# NOTE(review): the later compatibility report still lists both 'function'
# and 'category' as missing from the iModulon table; confirm this mapping
# matches the columns actually present in this model.
imodulon_column_map = {'category': 'broad_category', 'function': 'category'}
ica_data.imodulon_table = ica_data.imodulon_table.rename(
    columns=imodulon_column_map
)

## Links

In [9]:
# Base BioCyc gene URL; the gene table index label is appended to it.
prefix = 'https://biocyc.org/gene?orgid=GCF_005931095&id='

# add gene links: one entry per label in the gene table index
ica_data.gene_links.update(
    (locus_tag, prefix + locus_tag) for locus_tag in ica_data.gene_table.index
)

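# add regulator links (disabled: the TF Issues table above is empty, and the
# SubtiWiki links below are left over from the B. subtilis version of this notebook)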
# for r in tf_issues.index[(tf_issues.has_gene.astype(bool))]:
#     ica_data.tf_links[r] = prefix + ica_data.name2num(r)
    
# # some regulators don't have genes but do have pages
# # here, I link to the direct page since I had to check their existence anyway
# other_regs = {
#     'SwrA':'http://subtiwiki.uni-goettingen.de/v3/gene/view/5D479874B43F521DB52EDC2C27CDE4967F22DE47',
#     'SigK':'http://subtiwiki.uni-goettingen.de/v3/gene/view/24F7FD5C7C3A68BB2760ABB8CBD8FBD65E5FF7D4',
#     'YlxR':'http://subtiwiki.uni-goettingen.de/v3/gene/view/F4097349A563503468A2A14F062AEAC532C7917A',
#     'LnrK':'http://subtiwiki.uni-goettingen.de/v3/gene/view/387EF370CE24F7A3C20789A57329A02EBED46F53',
#     'KipR':'http://subtiwiki.uni-goettingen.de/v3/gene/view/7DA9A79876C546B78B716A64706A3A3716018C2E'
# }
# for k, v in other_regs.items():
#     ica_data.tf_links[k] = v

## Sample Table

In [11]:
# Keep the original sample identifiers under 'sample_og_name' so that a new
# 'sample' column can be generated later in the notebook.
# The rename is guarded: if this cell is re-run after 'sample' has been
# regenerated, an unguarded rename would move the generated names into a
# second 'sample_og_name' column.
if 'sample_og_name' not in ica_data.sample_table.columns:
    ica_data.sample_table = ica_data.sample_table.rename(
        columns={'sample': 'sample_og_name'}
    )

# Build the condition label "<project>_<exp_condition>" for every sample.
ica_data.sample_table['full_condition_name'] = [
    f"{project}_{exp_condition}"
    for project, exp_condition in zip(
        ica_data.sample_table['project'],
        ica_data.sample_table['exp_condition'],
    )
]

# Display the updated table.
ica_data.sample_table

Unnamed: 0,exp_theme,exp_condition,sample_og_name,shortd,longd,carbon_source,oxygen_level,nitrate_level,copper_level,lanthanum_level,growth_rate,growth_mode,include?,notes,cluster_id,project,condition,reference_condition,full_condition_name
5GB1_FM03_TR1_QC_tpm,uMax,uMax,5GB1_FM03_TR1_QC,uMax_QC,"Fermentor run 3, uMax though close to O2 limited, QC",2,1,0,3,0,2,0,1,,1,5G,uMax,uMax,5G_uMax
5GB1_FM03_TR2_QC_tpm,uMax,uMax,5GB1_FM03_TR2_QC,uMax_QC,"Fermentor run 3, uMax though close to O2 limited, QC",2,1,0,3,0,2,0,1,,3,5G,uMax,uMax,5G_uMax
5GB1_FM11_TR1_QC_tpm,lowO2_fast_growth,lowO2_fast_growth,5GB1_FM11_TR1_QC,lowO2_QC,"Fermentor run 11, O2 limited, QC",2,0,0,3,0,1,0,1,,18,5G,lowO2_fast_growth,uMax,5G_lowO2_fast_growth
5GB1_FM11_TR2_QC_tpm,lowO2_fast_growth,lowO2_fast_growth,5GB1_FM11_TR2_QC,lowO2_QC,"Fermentor run 11, O2 limited, QC",2,0,0,3,0,1,0,1,,18,5G,lowO2_fast_growth,uMax,5G_lowO2_fast_growth
5GB1_FM12_TR1_tpm,lowCH4,lowCH4,5GB1_FM12_TR1,lowCH4,"Fermentor run 12, methane limited",1,1,0,3,0,1,0,1,,17,5G,lowCH4,uMax,5G_lowCH4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5GB1C-5G-N-BR2_tpm,Lanthanum,NoLanthanum,5GB1C-5G-N-BR2,NoLa_rep2,Vial sample in mid- to late-exponential phase,2,1,0,3,0,2,1,1,,12,5G,NoLanthanum,uMax,5G_NoLanthanum
5GB1C-JG15-La-BR1_tpm,Lanthanum,WithLanthanum,5GB1C-JG15-La-BR1,deltaTBDT_La_rep1,Vial sample in mid- to late-exponential phase,2,1,0,3,1,2,1,1,,12,5G,WithLanthanum,uMax,5G_WithLanthanum
5GB1C-JG15-La-BR2_tpm,Lanthanum,WithLanthanum,5GB1C-JG15-La-BR2,deltaTBDT_La_rep2,Vial sample in mid- to late-exponential phase,2,1,0,3,1,2,1,1,,12,5G,WithLanthanum,uMax,5G_WithLanthanum
5GB1C-JG15-N-BR1_tpm,Lanthanum,NoLanthanum,5GB1C-JG15-N-BR1,deltaTBDT_NoLa_rep1,Vial sample in mid- to late-exponential phase,2,1,0,3,0,2,1,1,,12,5G,NoLanthanum,uMax,5G_NoLanthanum


In [12]:
# add replicate numbers: samples are numbered 1..n within each
# (project, condition) group, following their current row order
for _, group in ica_data.sample_table.groupby(['project', 'condition']):
    ica_data.sample_table.loc[group.index, 'replicate'] = range(1, group.shape[0] + 1)

# make sample names that include replicate number, e.g. "5G_uMax:3"
# NOTE(review): replicates are counted per (project, condition) while the name
# prefix is built from exp_condition; names are only guaranteed unique if the
# two columns agree for every sample. Confirm against the sample table.
#
# important: the generated names must end up in the 'sample' column. They are
# written there directly (instead of creating 'name' and renaming it) so that
# re-running this cell overwrites 'sample' rather than producing a second,
# duplicate 'sample' column.
ica_data.sample_table = ica_data.sample_table.assign(
    sample=(
        ica_data.sample_table.full_condition_name + ':' +
        ica_data.sample_table.replicate.astype(int).astype(str)
    )
)

In [14]:
# Fix some gene table column names. 'gene_name' and 'gene_product' are the
# gene columns the first compatibility report flagged as missing.
gene_column_map = {
    'gene_symbol': 'gene_name',
    'product': 'gene_product',
    'tu_id': 'operon',
    'start_coord': 'start',
}
ica_data.gene_table = ica_data.gene_table.rename(columns=gene_column_map)

In [11]:
#ica_data.sample_table.to_csv('bsub_sample_table.csv')

# TODO 
# add a DOI column and then replace the existing sample_table

## Double-check Compatibility

In [15]:
# Re-run the compatibility check now that the tables have been updated.
table_issues, tf_issues, missing_gene_links, missing_dois = imodulondb_compatibility(
    ica_data
)

# Heading -> object to display, in the order they are printed.
report = {
    '--Table Issues--': table_issues,
    '--TF Issues--': tf_issues,
    '--Missing Gene Links--': missing_gene_links.values,
    '--Missing DOIs--': missing_dois.values,
}
for title, item in report.items():
    print(title)
    display(item)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,publication_link,The publication name will not be a hyperlink.
1,Gene,regulator,"Regulator info will not display. If you have a TRN, add it to the model to auto-generate this column."
2,Sample,n_replicates,This column will be generated for you.
3,Sample,doi,Clicking on activity plot bars will not link to relevant papers for the samples.
4,iModulon,name,imodulon_table.index will be used.
5,iModulon,regulator,The regulator details will be left blank.
6,iModulon,function,"The function will be blank in the dataset table and ""Uncharacterized"" in the iModulon dashboard"
7,iModulon,category,"The categories will be filled in as ""Uncharacterized""."
8,iModulon,n_genes,This column will be computed for you.
9,iModulon,precision,This column will be left blank.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene


--Missing Gene Links--


array([], dtype=float64)

--Missing DOIs--


array(['5GB1_FM03_TR1_QC_tpm', '5GB1_FM03_TR2_QC_tpm',
       '5GB1_FM11_TR1_QC_tpm', '5GB1_FM11_TR2_QC_tpm',
       '5GB1_FM12_TR1_tpm', '5GB1_FM12_TR1_QC_tpm', '5GB1_FM12_TR2_tpm',
       '5GB1_FM12_TR2_QC_tpm', '5GB1_FM14_TR1_tpm',
       '5GB1_FM14_TR1_QC_tpm', '5GB1_FM14_TR2_tpm',
       '5GB1_FM14_TR2_QC_tpm', '5GB1_FM18_TR1_QC_tpm',
       '5GB1_FM18_TR2_tpm', '5GB1_FM18_TR2_QC_tpm', '5GB1_FM18_TR3_tpm',
       '5GB1_FM18_TR3_QC_tpm', '5GB1_FM19_TR1_tpm',
       '5GB1_FM19_TR1_QC_tpm', '5GB1_FM19_TR1_UW_tpm',
       '5GB1_FM19_TR3_tpm', '5GB1_FM19_TR3_QC_tpm',
       '5GB1_FM20_TR1_QC_tpm', '5GB1_FM20_TR2_QC_tpm',
       '5GB1_FM20_TR3_tpm', '5GB1_FM20_TR3_QC_tpm',
       '5GB1_FM20_TR3_UW_tpm', '5GB1_FM21_TR1_tpm',
       '5GB1_FM21_TR1_QC_tpm', '5GB1_FM21_TR2_tpm',
       '5GB1_FM21_TR2_QC_tpm', '5GB1_FM21_TR2_UW_tpm',
       '5GB1_FM22_TR1_tpm', '5GB1_FM22_TR1_QC_tpm', '5GB1_FM22_TR3_tpm',
       '5GB1_FM22_TR3_QC_tpm', '5GB1_FM22_TR3_UW_tpm',
       '5GB1_FM23_TR3_tpm', '5GB

## Save & Export

In [16]:
# Write the updated model to the same folder the raw model was loaded from.
imdb_model_path = path.join('..', 'data', '5g_processed_data', '5g_imdb.json.gz')
save_to_json(ica_data, imdb_model_path)

In [15]:
# Category order handed to imodulondb_export (as cat_order) in the export cell.
categories = [
    "Carbon Metabolism",
    "AA/Nucleotide Metabolism",
    "Misc. Metabolism",
    "Homeostasis",
    "Lifestyles",
    "Cellular Processes",
    "Stress Response",
    "Prophages",
    "Other",
    "Single Gene",
    "Uncharacterized",
]

In [23]:
# Inspect the sample table ordered by experimental condition (display only;
# the stored table is not modified).
ica_data.sample_table.sort_values(by='exp_condition')

Unnamed: 0,exp_theme,exp_condition,sample_og_name,shortd,longd,carbon_source,oxygen_level,nitrate_level,copper_level,lanthanum_level,...,growth_mode,include?,notes,cluster_id,project,condition,reference_condition,full_condition_name,replicate,sample
5GB1_LTrecycle_TR1_tpm,LanzaTech,LanzaTech,5GB1_LTrecycle_TR1,LTrecycle,LanzaTech Cell recycle,-1,-1,-1,-1,-1,...,-1,1,other exp,18,5G,LanzaTech,uMax,5G_LanzaTech,1.0,5G_LanzaTech:1
5GB1_LTrecycle_TR1_QC_tpm,LanzaTech,LanzaTech,5GB1_LTrecycle_TR1_QC,LTrecycle_QC,LanzaTech Cell recycle,-1,-1,-1,-1,-1,...,-1,1,other exp,18,5G,LanzaTech,uMax,5G_LanzaTech,2.0,5G_LanzaTech:2
5GB1_FM18_TR3_QC_tpm,MeOH,MeOH,5GB1_FM18_TR3_QC,MeOH_QC,"Fermentor run 18, methanol",0,1,0,3,0,...,0,1,,16,5G,MeOH,uMax,5G_MeOH,5.0,5G_MeOH:5
5GB1_FM23_TR3_QC_tpm,MeOH,MeOH,5GB1_FM23_TR3_QC,MeOH_QC,"Fermentor run 23, methanol",0,1,0,3,0,...,0,1,,16,5G,MeOH,uMax,5G_MeOH,7.0,5G_MeOH:7
5GB1_FM18_TR3_tpm,MeOH,MeOH,5GB1_FM18_TR3,MeOH,"Fermentor run 18, methanol",0,1,0,3,0,...,0,1,,16,5G,MeOH,uMax,5G_MeOH,4.0,5G_MeOH:4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5GB1_FM20_TR1_QC_tpm,uMax,uMax,5GB1_FM20_TR1_QC,uMax_QC,"Fermentor run 20, uMax, QC",2,1,0,3,0,...,0,1,,2,5G,uMax,uMax,5G_uMax,3.0,5G_uMax:3
5GB1_FM20_TR3_QC_tpm,uMax,uMax,5GB1_FM20_TR3_QC,uMax_QC,"Fermentor run 20, uMax, QC",2,1,0,3,0,...,0,1,,4,5G,uMax,uMax,5G_uMax,6.0,5G_uMax:6
5GB1_FM20_TR3_tpm,uMax,uMax,5GB1_FM20_TR3,uMax,"Fermentor run 20, uMax",2,1,0,3,0,...,0,1,,9,5G,uMax,uMax,5G_uMax,5.0,5G_uMax:5
5GB1_FM21_TR1_QC_tpm,uMax,uMax,5GB1_FM21_TR1_QC,uMax_QC,"Fermentor run 21, uMax, QC",2,1,0,3,0,...,0,1,,4,5G,uMax,uMax,5G_uMax,9.0,5G_uMax:9


In [17]:
# Export the iModulonDB site files to ../iModulonDB, ordering the categories
# as given by `categories`.
# Chained-assignment warnings are silenced only for the duration of the export;
# option_context restores the previous setting afterwards, so the warning stays
# active for any cells run later in the same kernel.
with pd.option_context('mode.chained_assignment', None):
    imodulondb_export(ica_data, '../iModulonDB', cat_order=categories)

Writing main site files...
Done writing main site files. Writing plot files...
Two progress bars will appear below. The second will take significantly longer than the first.
Writing iModulon page files (1/2)


  0%|          | 0/72 [00:00<?, ?it/s]

Writing Gene page files (2/2)


  0%|          | 0/4325 [00:00<?, ?it/s]

Complete! (Organism = b_subtilis; Dataset = modulome)
