# Task
Main task of the script is to output "1519377.tsv" for G. LC300 for use in Gecko to create EC model. Format:

Entry/Protein_id  Gene names (ordered locus) EC Number Mass (calculated) Sequence

Protein id: AKU... , taken from Genbank genome file for LC300
Gene_name: Locus tag (IB49_...), used to match between Genbank genome and genes/reactions in iGEL604
EC_no: Big question at the moment. Each gene may be implied in several reactions in iGEL604, with different EC no's
Sequence: Aa sequence for the protein, taken from Genbank sequence

In [1]:
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [2]:
# Constants
taxonomic_id = '1519377'  # Geobacillus sp. LC300

In [3]:
# Read information from LC300 genome file
with open('gene_prot_aaseq.csv', 'r') as f:  # Contains annotation information from all CDS in LC300 genome
    gene_prot_aa = pd.read_csv(f, sep=',')
    f.close()

gene_prot_aa.tail(15)

Unnamed: 0,Gene,Protein,Name,Protein_sequence
2818,IB49_18420,AKU27996.1,2-deoxy-D-gluconate 3-dehydrogenase,MFLPSFRLGGKTALVTGAGRGIGRAIAIGFAEAGADVALIARTEAD...
2819,IB49_18425,AKU27997.1,hypothetical protein,MIHHTWATRPTIKKVKCVHTNAEKYMVSNVLTPGKVYEVKNETDEF...
2820,IB49_18430,AKU27998.1,histidine kinase,MAGLYINQHVLNNLFYILVTIFAFSFIYDHSRAIRQRPLYGQALLG...
2821,IB49_18435,AKU27999.1,hypothetical protein,MHWLCPVFQQPNRQDAKERQHAAKPHSCAVRRQIGDFAEHDRTKRP...
2822,IB49_18440,AKU28000.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
2823,IB49_18445,AKU28001.1,GntR family transcriptional regulator,MFELDIRSRQPIYEQLIDKMKEMIVRELWQPHDQLPSVRTMAKQLM...
2824,IB49_18450,AKU28002.1,ABC transporter,MIQLVDVTKMFDRFAAVKGANMMVPKGAIYGLLGPNGAGKTTLLKM...
2825,IB49_00430,Pseudoprot1,hypothetical protein,MENQNRQNAAQCPFHGSVTNQSSNRTTNKDWWPNQLNLSILHQHDE...
2826,IB49_00915,Pseudoprot2,hypothetical protein,LRQYLQLLEDILENGVEKEDRTGVGTLSVFGRQLRFNLQDGFPLVT...
2827,IB49_00960,Pseudoprot3,hypothetical protein,LPINIPKDLPAKEILEQETFSSWTKNGRIRKISARSISLF


In [4]:
# Read reaction-gene information from iGEL604
with open('rxns_ec_genes.csv', 'r') as f:  # Contains all gene reaction rules + EC numbers from iGEL604
    rxns_ec_genes = pd.read_csv(f, sep=',')
    f.close()

rxns_ec_genes = rxns_ec_genes.rename(columns={'rxns':'Reaction', 'eccodes': 'EC-codes', 'genes' : 'Gene'})
rxns_ec_genes.head(15)

Unnamed: 0,Reaction,EC-codes,Gene,Unnamed: 3,Unnamed: 4
0,R00006,ec-code/2.2.1.6,IB49_05185 or IB49_05180,,
1,R00013,ec-code/4.1.1.47,IB49_05185,,
2,R00019,"ec-code/1.12.7.2,ec-code/1.12.99.-",IB49_09315,,
3,R00025,ec-code/1.13.12.16,IB49_01360,,
4,R00026,ec-code/3.2.1.21,IB49_08540,,
5,R00036,ec-code/4.2.1.24,IB49_05095,,
6,R00066,ec-code/2.5.1.9,IB49_03270,,
7,R00068,ec-code/1.10.3.3,IB49_14805,,
8,R00078,ec-code/1.16.3.1,IB49_14805,,
9,R00084,ec-code/2.5.1.61,IB49_05105,,


In [5]:
test = rxns_ec_genes[rxns_ec_genes['Gene'].str.contains('IB49_05185',na=False)]  # Test. Finds all reactions with IB49_05185
test

Unnamed: 0,Reaction,EC-codes,Gene,Unnamed: 3,Unnamed: 4
0,R00006,ec-code/2.2.1.6,IB49_05185 or IB49_05180,,
1,R00013,ec-code/4.1.1.47,IB49_05185,,
33,R00207,ec-code/1.2.3.3,IB49_05185,,
39,R00226,ec-code/2.2.1.6,IB49_05185 or IB49_05180,,
548,R03145,ec-code/1.2.5.1,IB49_05185,,
692,R04673,ec-code/2.2.1.6,IB49_05185 or IB49_05180,,
954,R08648,ec-code/2.2.1.6,IB49_05185 or IB49_05180,,


In [8]:
# Read list of all genes from iGEL626
with open('iGEL626_genes.csv','r') as infile:
    iGEL604_genes = pd.read_csv(infile)
    infile.close()
iGEL604_genes

Unnamed: 0,Gene
0,IB49_13935
1,IB49_13225
2,IB49_02295
3,IB49_07995
4,IB49_14525
...,...
621,IB49_16515
622,IB49_05415
623,IB49_05420
624,11830


In [9]:
rxns_ec_genes['EC-codes'] = rxns_ec_genes['EC-codes'].str.replace('ec-code/', '')  # Removes 'ec-code/' from the codes
rxns_ec_genes

Unnamed: 0,Reaction,EC-codes,Gene,Unnamed: 3,Unnamed: 4
0,R00006,2.2.1.6,IB49_05185 or IB49_05180,,
1,R00013,4.1.1.47,IB49_05185,,
2,R00019,"1.12.7.2,1.12.99.-",IB49_09315,,
3,R00025,1.13.12.16,IB49_01360,,
4,R00026,3.2.1.21,IB49_08540,,
...,...,...,...,...,...
1265,R03656,6.1.1.5,IB49_15960,,
1266,R03661,6.1.1.15,IB49_16515,,
1267,R03660,6.1.1.20,IB49_05415 and IB49_05420,,
1268,R03905,6.3.5.7,IB49_11825 and 11830,,


# Strategy for matching genes and ec codes in iGEL604 to protein names and sequences

1) Hitta subset av rader i "gene_prot_aa" där gene_prot_aa['Gene'] finns i rxns_ec_genes['Gene']
2) använd gene_prot_aa.loc[<bool-serie från 1] för att extrahera raderna

In [10]:
test = gene_prot_aa['Gene'].isin(iGEL604_genes['Gene'])
genes_in_both_dfs = gene_prot_aa.loc[test]
genes_in_both_dfs  # Finds 624 genes from the model in the extracted data

Unnamed: 0,Gene,Protein,Name,Protein_sequence
22,IB49_00155,AKU25202.1,glutamine synthetase,MSKTFVSSTQTGLLEQIKETIQQKNVELLHLQFVDIEGILKHVTVT...
32,IB49_00225,AKU25211.1,NADPH-dependent oxidoreductase,MNAVIETILRHRSIRRFEERPLTDEQIRTIVECAQAASTSSYVQAY...
34,IB49_00235,AKU25213.1,acyl-CoA dehydrogenase,MYLRLTDEQRMVQKAIRKFVEKELMPLENEVLRNEWEGKPGLAPEK...
35,IB49_00240,AKU25214.1,3-oxoacyl-ACP reductase,MNQRFAGRVAFVTGGSRGIGKAIVTRFAEEGAKVAFIDLNEEALEA...
38,IB49_00260,AKU25217.1,acetyl-CoA acetyltransferase,MKRDAVIVSAVRTAIARQGGALATLPAHIYGAEVIKEAMRRANIGP...
...,...,...,...,...
2828,IB49_04485,Pseudoprot4,hypothetical protein,MENVYGLLGFPVEHSLSPLMHNDAFVRLGIPARYHLFSVHPKQVSE...
2829,IB49_05560,Pseudoprot5,hypothetical protein,VTQGEKITVTNGVLNVPNNPIIPFIEGDGTGPDIWAAASRVLEAAV...
2830,IB49_13335,Pseudoprot6,hypothetical protein,MRVLVVGAGAVGGYFGGRLLEKGVDVTFLVRERRKRELEERGLVIR...
2831,IB49_16075,Pseudoprot7,hypothetical protein,MIKGKHILLCVTGGWRRTRRRCSPASSSSAAQK


In [11]:
# Build uniprot dataframe?
uniprot_file_df = pd.DataFrame(columns = ['Entry', 'Gene names (ordered locus)', 'EC Number','Mass', 'Sequence'])
uniprot_file_df['Entry'] = genes_in_both_dfs['Protein']
uniprot_file_df['Gene names (ordered locus)'] = genes_in_both_dfs['Gene']
#uniprot_file_df['Gene names (primary)'] = 'N/A'
uniprot_file_df['EC Number'] = 'N/A'
#uniprot_file_df['Cross-reference (GeneID)'] = 'N/A'
#uniprot_file_df['Cross-reference (RefSeq)'] = 'N/A'
uniprot_file_df['Sequence'] = genes_in_both_dfs['Protein_sequence']

sequences = uniprot_file_df['Sequence'].tolist()
MWs = []
for sequence in sequences:
    analyzer = ProteinAnalysis(sequence)
    mw = analyzer.molecular_weight()
    MWs.append(mw)
uniprot_file_df['Mass'] = MWs
uniprot_file_df  # Finished, but missing EC number

Unnamed: 0,Entry,Gene names (ordered locus),EC Number,Mass,Sequence
22,AKU25202.1,IB49_00155,,50017.8871,MSKTFVSSTQTGLLEQIKETIQQKNVELLHLQFVDIEGILKHVTVT...
32,AKU25211.1,IB49_00225,,28391.1067,MNAVIETILRHRSIRRFEERPLTDEQIRTIVECAQAASTSSYVQAY...
34,AKU25213.1,IB49_00235,,44412.6117,MYLRLTDEQRMVQKAIRKFVEKELMPLENEVLRNEWEGKPGLAPEK...
35,AKU25214.1,IB49_00240,,27394.7499,MNQRFAGRVAFVTGGSRGIGKAIVTRFAEEGAKVAFIDLNEEALEA...
38,AKU25217.1,IB49_00260,,41103.7148,MKRDAVIVSAVRTAIARQGGALATLPAHIYGAEVIKEAMRRANIGP...
...,...,...,...,...,...
2828,Pseudoprot4,IB49_04485,,20295.1950,MENVYGLLGFPVEHSLSPLMHNDAFVRLGIPARYHLFSVHPKQVSE...
2829,Pseudoprot5,IB49_05560,,7342.3071,VTQGEKITVTNGVLNVPNNPIIPFIEGDGTGPDIWAAASRVLEAAV...
2830,Pseudoprot6,IB49_13335,,14024.1660,MRVLVVGAGAVGGYFGGRLLEKGVDVTFLVRERRKRELEERGLVIR...
2831,Pseudoprot7,IB49_16075,,3643.2794,MIKGKHILLCVTGGWRRTRRRCSPASSSSAAQK


In [12]:
uniprot_file_df.to_csv('%s.tsv' %(taxonomic_id), sep='\t', index=False)

# Missing genes
9 genes are missing from the extracted CDS from the genome. These appear to be annotated as pseudogenes, probably manually added to iGEL604 by Emil. Will come back and look at these later

In [19]:
test2 = ~iGEL604_genes['Gene'].isin(genes_in_both_dfs['Gene'])
#model_genes_missing = rxns_ec_genes[test2]
#rxns_ec_genes[test2]
missing_genes = iGEL604_genes.loc[test2, 'Gene'].values.tolist()  # 9 Genes, for some reason indexed in a row, are missing?

# 9 Genes, for some reason indexed in a row, are missing? Manually added?
for gene in missing_genes:
    print(gene)

IB49_16075
IB49_13335
IB49_00430
IB49_05560
IB49_04485
IB49_16565
IB49_00915
IB49_00960
IB49_1815


In [32]:
df = pd.DataFrame()
rxns_ec_genes_nona = rxns_ec_genes.loc[~rxns_ec_genes['Gene'].isna()]

for gene in missing_genes:
    id_bool = rxns_ec_genes_nona['Gene'].str.contains(str(gene))
    print(rxns_ec_genes_nona.loc[id_bool])


     Reaction EC-codes        Gene
1131   R04231      NaN  IB49_16075
1132   R03269      NaN  IB49_16075
     Reaction EC-codes        Gene
1133   R02472      NaN  IB49_13335
     Reaction EC-codes        Gene
1134   R00009      NaN  IB49_00430
     Reaction EC-codes        Gene
1135   R00267      NaN  IB49_05560
     Reaction EC-codes        Gene
1136   R02413      NaN  IB49_04485
     Reaction EC-codes        Gene
1137   R00549      NaN  IB49_16565
1138   R00161      NaN  IB49_16565
     Reaction EC-codes        Gene
1141   R02101      NaN  IB49_00915
     Reaction EC-codes        Gene
1143   R01777      NaN  IB49_00960
              Reaction EC-codes                                    Gene
1145            R04014      NaN  IB49_00680 or IB49_01350 or IB49_18150
1146            R08163      NaN  IB49_00680 or IB49_01350 or IB49_18150
1147            R08159      NaN  IB49_00680 or IB49_01350 or IB49_18150
1148            R01706      NaN  IB49_00680 or IB49_01350 or IB49_18150
1186  3HB 

# Below  here is temp code to query Brenda

In [51]:
from zeep import Client
import hashlib
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
password = hashlib.sha256("<password here>".encode("utf-8")).hexdigest()  # Insert password at runtime
client = Client(wsdl)
parameters = ( "<email here>",password,"ecNumber*1.1.1.1","organism*Homo sapiens","kmValue*",
              "kmValueMaximum*","substrate*","commentary*","ligandStructureId*", "literature")
resultString = client.service.getKmValue(*parameters)
print (resultString)

[{
    'literature': [
        285577
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': None,
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        285578
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': None,
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        285568
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': 'Km values for the class I isoenzymes with the substrates ethanol, methanol, ethylene glycol, benzyl alcohol, octanol, cyclohexanol and 16-hydroxyhexadecanoic acid',
    'organism': 'Homo sapiens',
    'ecNumber': '1.1.1.1',
    'ligandStructureId': 0
}, {
    'literature': [
        655206
    ],
    'substrate': 'more',
    'kmValue': '-999',
    'kmValueMaximum': None,
    'commentary': 's

In [53]:
parameters = ( "<PUT EMAIL HERE>",password,"ecNumber*1.1.1.1","organism*","synonyms*")
resultString = client.service.getEnzymeNames(*parameters)
print (resultString)

[{
    'synonyms': 'More',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'aldehyde reductase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'ADH',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'TaDH',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'dehydrogenase, alcohol',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'alcohol dehydrogenase (NAD)',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'aliphatic alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'ethanol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NAD-dependent alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NAD-specific aromatic alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NADH-alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'NADH-aldehyde dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'primary alcohol dehydrogenase',
    'ecNumber': '1.1.1.1'
}, {
    'synonyms': 'yeast alcohol dehydrogenase