In [None]:
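# Reads all coding sequences (CDS) from the LC300 GenBank genome, extracts the gene id, protein id,
# product name and amino-acid sequence of each, and writes the hypothetical and the annotated
# proteins to two separate FASTA files.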

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd

In [2]:
import csv

In [44]:
# Read the LC300 genome from its GenBank file into `genome`.
# The `with` block closes the file when it exits, so no explicit close() is needed.
with open("../../../data/LC300_genome.gb", 'r') as f:
    genome = SeqIO.read(f, 'gb')

In [45]:
# Collect every coding-sequence (CDS) feature of the genome, keeping the order of genome.features
cds = [feature for feature in genome.features if feature.type == "CDS"]
        


In [46]:
# Build a table of gene id, protein id, product name and amino-acid sequence.
# The first row is the header; every following row describes one CDS feature.
#
# Biopython stores each qualifier as a list of strings, so the first entry is
# read directly. Parsing str(list) by stripping "[", "]" and "'" corrupts
# values: a product such as 5'-nucleotidase keeps stray double quotes, and a
# name that starts or ends with "[", "]" or "'" loses that character.
#
# A CDS that lacks any of the four qualifiers still raises KeyError.
proteinsList = [['Gene', 'Protein', 'Name', 'Protein_sequence']]
for feature in cds:
    qualifiers = feature.qualifiers
    locus = qualifiers["locus_tag"][0]
    protein = qualifiers["protein_id"][0]
    seq = qualifiers["translation"][0]
    name = qualifiers["product"][0]
    proteinsList.append([locus, protein, name, seq])

In [47]:
# Turn the list of rows into a DataFrame. Row 0 of proteinsList holds the
# column labels, so it becomes the header and only the rows after it are kept
# as data (their row labels therefore start at 1).
proteinsDF = pd.DataFrame(proteinsList)
proteinsDF = proteinsDF.iloc[1:].set_axis(proteinsDF.iloc[0], axis="columns")
proteinsDF

Unnamed: 0,Gene,Protein,Name,Protein_sequence
1,IB49_00015,AKU25180.1,hypothetical protein,MKKKRFTVAEGETIAACLARMKQEGYRPVRRIEQPIFREVETNGET...
2,IB49_00020,AKU25181.1,hypothetical protein,MMDEQESKRQFQDDLDQYRMDNVIHAPKHYVYQVGYEASSGNPTGG...
3,IB49_00030,AKU25182.1,hypothetical protein,MIHHTWATRPTIKKVKCVHTNAEKYMVSNVLTPGKVYEVKNETDEF...
4,IB49_00040,AKU25183.1,hypothetical protein,MHWLCPVFQQPNRQDAKERQHAAKPHSCAVRRQIGDFAEHDRTKRP...
5,IB49_00045,AKU25184.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
...,...,...,...,...
2821,IB49_18430,AKU27998.1,histidine kinase,MAGLYINQHVLNNLFYILVTIFAFSFIYDHSRAIRQRPLYGQALLG...
2822,IB49_18435,AKU27999.1,hypothetical protein,MHWLCPVFQQPNRQDAKERQHAAKPHSCAVRRQIGDFAEHDRTKRP...
2823,IB49_18440,AKU28000.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
2824,IB49_18445,AKU28001.1,GntR family transcriptional regulator,MFELDIRSRQPIYEQLIDKMKEMIVRELWQPHDQLPSVRTMAKQLM...


In [48]:
# Keep the proteins whose product name contains "hypothetical" (case-sensitive match)
hypotheticalProteins = proteinsDF.loc[
    lambda frame: frame['Name'].str.contains("hypothetical")
]
hypotheticalProteins

Unnamed: 0,Gene,Protein,Name,Protein_sequence
1,IB49_00015,AKU25180.1,hypothetical protein,MKKKRFTVAEGETIAACLARMKQEGYRPVRRIEQPIFREVETNGET...
2,IB49_00020,AKU25181.1,hypothetical protein,MMDEQESKRQFQDDLDQYRMDNVIHAPKHYVYQVGYEASSGNPTGG...
3,IB49_00030,AKU25182.1,hypothetical protein,MIHHTWATRPTIKKVKCVHTNAEKYMVSNVLTPGKVYEVKNETDEF...
4,IB49_00040,AKU25183.1,hypothetical protein,MHWLCPVFQQPNRQDAKERQHAAKPHSCAVRRQIGDFAEHDRTKRP...
8,IB49_00065,AKU25187.1,hypothetical protein,MKLIVVWLMFALLAIGTMVSMDQLMGMTLHQSLHIVLNPFRVMKAP...
...,...,...,...,...
2803,IB49_18325,AKU27980.1,hypothetical protein,MPMKFTDDLYEYYKDRLTGDEEDAEAVAMSILDELDRRDVLKLIGE...
2817,IB49_18410,AKU27994.1,hypothetical protein,MKKKRFTVAEGETIAACLARMKQEGYRPVRRIEQPIFREVETNGET...
2818,IB49_18415,AKU27995.1,hypothetical protein,MMDEQESKRQFQDDLDQYRMDNVIHAPKHYVYQVGYEASSGNPTGG...
2820,IB49_18425,AKU27997.1,hypothetical protein,MIHHTWATRPTIKKVKCVHTNAEKYMVSNVLTPGKVYEVKNETDEF...


In [49]:
# Keep the proteins whose product name does NOT contain "hypothetical" (case-sensitive match)
nonHypotheticalProteins = proteinsDF.loc[
    lambda frame: ~frame['Name'].str.contains("hypothetical")
]
nonHypotheticalProteins

Unnamed: 0,Gene,Protein,Name,Protein_sequence
5,IB49_00045,AKU25184.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
6,IB49_00050,AKU25185.1,GntR family transcriptional regulator,MFELDIRSRQPIYEQLIDKMKEMIVRELWQPHDQLPSVRTMAKQLM...
7,IB49_00055,AKU25186.1,ABC transporter,MIQLVDVTKMFDRFAAVKGANMMVPKGAIYGLLGPNGAGKTTLLKM...
9,IB49_00070,AKU25188.1,spore gernimation protein KB,MLFQWGMGRQKKQINDRVKQEGPDHSGEAADVPQEPMSAELAVNLD...
10,IB49_00075,AKU25189.1,spore gernimation protein KC,MKRPIAMFVSFFVCAVLLAGCWSKKELTDLGVVIAVGLDKTKDGRY...
...,...,...,...,...
2819,IB49_18420,AKU27996.1,2-deoxy-D-gluconate 3-dehydrogenase,MFLPSFRLGGKTALVTGAGRGIGRAIAIGFAEAGADVALIARTEAD...
2821,IB49_18430,AKU27998.1,histidine kinase,MAGLYINQHVLNNLFYILVTIFAFSFIYDHSRAIRQRPLYGQALLG...
2823,IB49_18440,AKU28000.1,membrane associated protein,MREDFRLPPHPVYVPVTLIRDGQLLADELAELGKTEQWLAAKLQKQ...
2824,IB49_18445,AKU28001.1,GntR family transcriptional regulator,MFELDIRSRQPIYEQLIDKMKEMIVRELWQPHDQLPSVRTMAKQLM...


In [53]:
# Write the hypothetical proteins as FASTA: a ">gene_id protein_id" header line
# followed by the amino-acid sequence on a single line.
with open("../../../output/deepec/hypotheticalProteins.fasta", "w") as outfile:
    for gene_id, protein_id, sequence in zip(
        hypotheticalProteins['Gene'],
        hypotheticalProteins["Protein"],
        hypotheticalProteins["Protein_sequence"],
    ):
        outfile.write(f">{gene_id} {protein_id}\n")
        outfile.write(f"{sequence}\n")
        

In [54]:
# Write the annotated (non-hypothetical) proteins as FASTA: a
# ">gene_id protein_id" header line followed by the amino-acid sequence on a single line.
with open("../../../output/deepec/annotatedProteins.fasta", "w") as outfile:
    for gene_id, protein_id, sequence in zip(
        nonHypotheticalProteins['Gene'],
        nonHypotheticalProteins["Protein"],
        nonHypotheticalProteins["Protein_sequence"],
    ):
        outfile.write(f">{gene_id} {protein_id}\n")
        outfile.write(f"{sequence}\n")