In [1]:
import pandas as pd

In [54]:
# Read information from the LC300 genome file.
# The CSV contains annotation information for all CDS in the LC300 genome.
# pd.read_csv is given the path directly so that pandas opens and closes the
# file itself; the previous explicit close() inside a `with` block was redundant.
gene_prot_aa = pd.read_csv('gene_prot_aaseq.csv', sep=',')

# Preview the first rows (gene locus tag, protein id, name, protein sequence).
gene_prot_aa.head(15)

Unnamed: 0,Gene,Protein,Name,Protein_sequence
0,IB49_00015,AKU25180.1,hypothetical protein,MKKKRFTVAEGETIAACLARMKQEGYRPVRRIEQPIFREVETNGET...
1,IB49_00020,AKU25181.1,hypothetical protein,MMDEQESKRQFQDDLDQYRMDNVIHAPKHYVYQVGYEASSGNPTGG...
2,IB49_00030,AKU25182.1,hypothetical protein,MIHHTWATRPTIKKVKCVHTNAEKYMVSNVLTPGKVYEVKNETDEF...
3,IB49_00040,AKU25183.1,hypothetical protein,MHWLCPVFQQPNRQDAKERQHAAKPHSCAVRRQIGDFAEHDRTKRP...
4,IB49_00045,AKU25184.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
5,IB49_00050,AKU25185.1,GntR family transcriptional regulator,MFELDIRSRQPIYEQLIDKMKEMIVRELWQPHDQLPSVRTMAKQLM...
6,IB49_00055,AKU25186.1,ABC transporter,MIQLVDVTKMFDRFAAVKGANMMVPKGAIYGLLGPNGAGKTTLLKM...
7,IB49_00065,AKU25187.1,hypothetical protein,MKLIVVWLMFALLAIGTMVSMDQLMGMTLHQSLHIVLNPFRVMKAP...
8,IB49_00070,AKU25188.1,spore gernimation protein KB,MLFQWGMGRQKKQINDRVKQEGPDHSGEAADVPQEPMSAELAVNLD...
9,IB49_00075,AKU25189.1,spore gernimation protein KC,MKRPIAMFVSFFVCAVLLAGCWSKKELTDLGVVIAVGLDKTKDGRY...


In [55]:
# Read reaction-gene information from iGEL604.
# The CSV contains all gene reaction rules + EC numbers from iGEL604.
# pandas manages the file handle when given a path, so no manual open/close
# is needed; the raw column names are mapped to readable ones in the same step.
rxns_ec_genes = (
    pd.read_csv('rxns_ec_genes.csv', sep=',')
    .rename(columns={'rxns': 'Reaction', 'eccodes': 'EC-codes', 'genes': 'Gene'})
)

# Preview the first rows of the reaction / EC-code / gene-rule table.
rxns_ec_genes.head(15)

Unnamed: 0,Reaction,EC-codes,Gene
0,R00006,ec-code/2.2.1.6,IB49_05185 or IB49_05180
1,R00013,ec-code/4.1.1.47,IB49_05185
2,R00019,ec-code/1.12.7.2;ec-code/1.12.99.-,IB49_09315
3,R00025,ec-code/1.13.12.16,IB49_01360
4,R00026,ec-code/3.2.1.21,IB49_08540
5,R00036,ec-code/4.2.1.24,IB49_05095
6,R00066,ec-code/2.5.1.9,IB49_03270
7,R00068,ec-code/1.10.3.3,IB49_14805
8,R00078,ec-code/1.16.3.1,IB49_14805
9,R00084,ec-code/2.5.1.61,IB49_05105


In [57]:
# Spot check: select every reaction whose gene rule mentions IB49_05185.
# na=False treats rows without a gene rule as non-matching instead of NaN,
# which keeps the mask purely boolean.
test = rxns_ec_genes.loc[
    rxns_ec_genes['Gene'].str.contains('IB49_05185', na=False)
]
test

Unnamed: 0,Reaction,EC-codes,Gene
0,R00006,ec-code/2.2.1.6,IB49_05185 or IB49_05180
1,R00013,ec-code/4.1.1.47,IB49_05185
33,R00207,ec-code/1.2.3.3,IB49_05185
39,R00226,ec-code/2.2.1.6,IB49_05185 or IB49_05180
548,R03145,ec-code/1.2.5.1,IB49_05185
692,R04673,ec-code/2.2.1.6,IB49_05185 or IB49_05180
954,R08648,ec-code/2.2.1.6,IB49_05185 or IB49_05180


In [72]:
# Read the list of all genes from iGEL604.
# The path is handed straight to pd.read_csv, which opens and closes the file
# itself; the previous explicit close() inside a `with` block was redundant.
iGEL604_genes = pd.read_csv('iGEL604_genes.csv')
iGEL604_genes

Unnamed: 0,Gene
0,IB49_13935
1,IB49_13225
2,IB49_02295
3,IB49_07995
4,IB49_14525
...,...
599,IB49_17550
600,IB49_11720
601,IB49_01790
602,IB49_09765


In [38]:
# Remove the 'ec-code/' prefix from every EC code entry.
# regex=False forces a literal substring replacement: the default of `regex`
# for Series.str.replace differs between pandas versions, so being explicit
# keeps the result (and the absence of warnings) independent of the version.
# Re-running this cell is harmless because already-stripped values no longer
# contain the prefix.
rxns_ec_genes['EC-codes'] = rxns_ec_genes['EC-codes'].str.replace(
    'ec-code/', '', regex=False
)
rxns_ec_genes

Unnamed: 0,Reaction,EC-codes,Gene
0,R00006,2.2.1.6,IB49_05185 or IB49_05180
1,R00013,4.1.1.47,IB49_05185
2,R00019,1.12.7.2;1.12.99.-,IB49_09315
3,R00025,1.13.12.16,IB49_01360
4,R00026,3.2.1.21,IB49_08540
...,...,...,...
1235,DNA,,IB49_17680 and IB49_05605 and IB49_16520 and I...
1236,Glycogen,,
1237,Cofactor_Pool,,
1238,Biomass,,


# Strategy for matching genes and ec codes in iGEL604 to protein names and sequences

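1) Find the subset of rows in "gene_prot_aa" where gene_prot_aa['Gene'] is present in the model's gene list, iGEL604_genes['Gene'] (rxns_ec_genes['Gene'] holds whole gene rules such as "A or B", not single gene IDs)
2) Use gene_prot_aa.loc[<boolean Series from step 1>] to extract those rows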

In [76]:
# Boolean mask over the annotation table: True where the row's gene is also
# listed among the iGEL604 model genes.
test = gene_prot_aa['Gene'].isin(iGEL604_genes['Gene'])

# Keep only the annotation rows flagged by the mask.
genes_in_both_dfs = gene_prot_aa[test]
genes_in_both_dfs  # Finds 595 genes from the model in the extracted data

Unnamed: 0,Gene,Protein,Name,Protein_sequence
22,IB49_00155,AKU25202.1,glutamine synthetase,MSKTFVSSTQTGLLEQIKETIQQKNVELLHLQFVDIEGILKHVTVT...
32,IB49_00225,AKU25211.1,NADPH-dependent oxidoreductase,MNAVIETILRHRSIRRFEERPLTDEQIRTIVECAQAASTSSYVQAY...
34,IB49_00235,AKU25213.1,acyl-CoA dehydrogenase,MYLRLTDEQRMVQKAIRKFVEKELMPLENEVLRNEWEGKPGLAPEK...
35,IB49_00240,AKU25214.1,3-oxoacyl-ACP reductase,MNQRFAGRVAFVTGGSRGIGKAIVTRFAEEGAKVAFIDLNEEALEA...
38,IB49_00260,AKU25217.1,acetyl-CoA acetyltransferase,MKRDAVIVSAVRTAIARQGGALATLPAHIYGAEVIKEAMRRANIGP...
...,...,...,...,...
2808,IB49_18360,AKU27986.1,biotin carboxylase,MFSKVLIANRGEIAVRIIRTCQKLGIRTVAVYSEADADSLHVSLAD...
2809,IB49_18365,AKU27987.1,acetyl-CoA carboxylase,MTQVTATMAGSVWKLLVAVGDHVEEGQDVIILESMKMEIPIAAEAS...
2810,IB49_18375,AKU27988.1,enoyl-CoA hydratase,MSALVSFETQENGIAIVTLNRPEAANALSRALLFELGALFQEIKFR...
2811,IB49_18380,AKU27989.1,carboxylase,MKETANQQDTLAAELEKRAAEIKKGGAPKYHEKNAAQGKLFVRERL...


In [77]:
# Mask over the model gene list: True for genes that were NOT found among the
# matched annotation rows.
test2 = ~iGEL604_genes['Gene'].isin(genes_in_both_dfs['Gene'])
iGEL604_genes[test2]  # 9 genes are missing; for some reason they sit almost consecutively in the index

# The genes identified here all appear to be annotated as pseudogenes because
# they are disrupted. Manual analysis needed.
# NOTE(review): 'IB49_1815' has a four-digit suffix, unlike the other locus
# tags shown - possibly a typo in the model rather than a pseudogene; confirm.

Unnamed: 0,Gene
475,IB49_16075
476,IB49_13335
477,IB49_00430
478,IB49_05560
479,IB49_04485
480,IB49_16565
481,IB49_00915
482,IB49_00960
492,IB49_1815


In [84]:
# Finds reactions in the gene reaction list that map to more than one isozyme,
# i.e. whose gene rule contains the logical operator "or".
# Rows without a gene rule are dropped first so the string match never sees NaN.
test123 = rxns_ec_genes.dropna(subset=['Gene'])

# Match "or" only as a whole word: a bare substring test would also flag any
# gene identifier that merely contains the letters "or".
test234 = test123['Gene'].str.contains(r'\bor\b')
test123.loc[test234]  # 271 reactions have multiple isoenzymes in gene reaction rules

Unnamed: 0,Reaction,EC-codes,Gene
0,R00006,ec-code/2.2.1.6,IB49_05185 or IB49_05180
18,R00131,ec-code/3.5.1.5,IB49_01525 or IB49_01530 or IB49_01535
22,R00158,ec-code/2.7.4.4;ec-code/2.7.4.14;ec-code/2.7.4.22,IB49_02910 or IB49_02655 or IB49_16485
27,R00190,ec-code/2.4.2.7;ec-code/2.4.2.8,IB49_04760 or IB49_10525
30,R00200,ec-code/2.7.1.40,IB49_05580 or IB49_07195
...,...,...,...
1151,R00104,,IB49_05820 or IB49_14470
1153,R00004,,IB49_03015 or IB49_07560
1154,R00138,,IB49_03015 or IB49_07560
1186,3HB thioesterase,,IB49_00680 or IB49_01350 or IB49_1815


In [85]:
# Finds reactions whose gene rule combines several genes with the logical
# operator "and", i.e. the enzyme needs more than one gene product.
# Rows without a gene rule are dropped first so the string match never sees NaN.
test123 = rxns_ec_genes.dropna(subset=['Gene'])

# Match "and" only as a whole word: a bare substring test would also flag any
# gene identifier that merely contains the letters "and".
test234 = test123['Gene'].str.contains(r'\band\b')
test123.loc[test234]  # 13 reactions are catalyzed by enzymes with more than one gene involved. All manually added?

Unnamed: 0,Reaction,EC-codes,Gene
1208,PTS,,IB49_15245 and IB49_15250 and IB49_09775 and I...
1209,PTS_Fru,,IB49_01210 and IB49_15255
1210,PTS_Man,,IB49_01210 and IB49_15255
1211,Glycerol3P_import,,IB49_01970 and IB49_01980 and IB49_01985
1212,Xylose_import,,IB49_01410 and IB49_01390
1213,Molybdate_import,,IB49_05405 and IB49_14195
1222,Phosphate_import,,IB49_04110 and IB49_04100 and IB49_04075
1223,Glucose_import(ABC),,IB49_08495 and IB49_08490 and IB49_08485 and I...
1225,Complex_I,,IB49_09280 and IB49_09330 and IB49_09325 and I...
1226,Complex_II,,IB49_05230 and IB49_05235


# Temporary code for querying the BRENDA enzyme database

In [51]:
import hashlib
import os
from getpass import getpass

from zeep import Client

# SOAP endpoint description of the BRENDA web service.
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"

# Credentials are resolved at runtime so they never have to be typed into (or
# saved with) the notebook: the environment variables are used when set,
# otherwise the user is prompted. getpass keeps the password out of the output.
brenda_email = os.environ.get("BRENDA_EMAIL") or input("BRENDA e-mail: ")
brenda_password = os.environ.get("BRENDA_PASSWORD") or getpass("BRENDA password: ")

# The service is sent the SHA-256 hex digest of the password, not the password itself.
password = hashlib.sha256(brenda_password.encode("utf-8")).hexdigest()
del brenda_password  # do not keep the clear-text password in the kernel namespace

client = Client(wsdl)

# Query Km values for EC 1.1.1.1 in Homo sapiens; fields ending in a bare '*'
# carry no filter value.
# NOTE(review): "literature" is the only field without a trailing '*' - confirm
# against the BRENDA SOAP documentation whether that is intentional.
parameters = (brenda_email, password, "ecNumber*1.1.1.1", "organism*Homo sapiens", "kmValue*",
              "kmValueMaximum*", "substrate*", "commentary*", "ligandStructureId*", "literature")
resultString = client.service.getKmValue(*parameters)
print(resultString)

[{
    'literature': [
        285577
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': None,
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        285578
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': None,
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        285568
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': 'Km values for the class I isoenzymes with the substrates ethanol, methanol, ethylene glycol, benzyl alcohol, octanol, cyclohexanol and 16-hydroxyhexadecanoic acid',
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        655206
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': 's

In [53]:
import os

# Look up the account e-mail at runtime instead of hardcoding a personal
# address in the notebook: use BRENDA_EMAIL when set, otherwise prompt.
brenda_email = os.environ.get("BRENDA_EMAIL") or input("BRENDA e-mail: ")

# Query enzyme names for EC 1.1.1.1 with no organism filter.
# `password` (SHA-256 hex digest) and `client` come from the previous BRENDA cell.
parameters = (brenda_email, password, "ecNumber*1.1.1.1", "organism*", "synonyms*")
resultString = client.service.getEnzymeNames(*parameters)
print(resultString)

[{
    'synonyms': 'More',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'aldehyde reductase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'ADH',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'TaDH',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'dehydrogenase, alcohol',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'alcohol dehydrogenase (NAD)',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'aliphatic alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'ethanol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NAD-dependent alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NAD-specific aromatic alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NADH-alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NADH-aldehyde dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'primary alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'yeast alcohol dehydrogenase