# Extracting Gene Ontology (GO) terms from annotated toxins to generate an annotation summary using GO Slim

Let's import the required modules:

In [58]:
import pandas as pd
import numpy as np

Loading toxins content (prot_ID and GO) from Toxins_transcriptome.xlsx into dataframe:

In [59]:
# Read only the protein identifier and the three GO annotation columns
# from the "Somente toxinas" sheet of the workbook.
toxins = pd.read_excel(
    "Toxins_transcriptome.xlsx",
    sheet_name="Somente toxinas",
    usecols=[
        "prot_id",
        "gene_ontology_BLASTX",
        "gene_ontology_BLASTP",
        "gene_ontology_Pfam",
    ],
)

Renaming GO columns:

In [60]:
# Give the three GO columns shorter names.
toxins = toxins.rename(
    columns={
        "gene_ontology_BLASTX": "GO_Blastx",
        "gene_ontology_BLASTP": "GO_Blastp",
        "gene_ontology_Pfam": "GO_Pfam",
    }
)

Removing rows with no prot_id or with a missing GO annotation in any of the three sources (BLASTX, BLASTP or Pfam):

In [61]:
toxins = (
    toxins
    # Turn empty strings into NaN so that dropna() can detect them.
    .replace("", np.nan)
    # Drop rows that have no protein identifier.
    .dropna(axis=0, subset=["prot_id"])
    # dropna() defaults to how="any": a row is dropped as soon as ONE of the
    # three GO columns is missing, so every remaining row has all three filled.
    .dropna(axis=0, subset=["GO_Blastx", "GO_Blastp", "GO_Pfam"])
)

Exporting cleaned dataframe to excel:

In [62]:
# Write the cleaned table to an Excel file, leaving out the DataFrame index.
toxins.to_excel(excel_writer="GO_terms.xlsx", index=False)

The dataframe has been manually checked to confirm that it was properly cleaned, i.e. it has no rows without a prot_id and no rows with missing GO annotations.

It is now time to parse the data:

**Idea**: Create a dictionary of sets, where:

- prot_id are keys

- GO_terms are values (inside sets)

In [64]:
GO = {}
for record in toxins.itertuples():
    # Glue the three annotation strings together, then turn every "`" into "^"
    # so that a single split on "^" yields all individual fields.
    merged = "^".join((record.GO_Blastx, record.GO_Blastp, record.GO_Pfam))
    fields = merged.replace("`", "^").split("^")
    for field in fields:
        if not field.startswith("GO:"):
            continue  # only fields starting with "GO:" are kept as GO identifiers
        # setdefault() creates the set the first time a prot_id yields a GO id;
        # storing ids in a set means duplicates are never added twice.
        GO.setdefault(record.prot_id, set()).add(field)

{'TRINITY_DN9244_c0_g1_i1.p1': {'GO:0090729', 'GO:0005576', 'GO:0008270', 'GO:0006508', 'GO:0004222'}, 'TRINITY_DN8384_c0_g1_i4.p1': {'GO:0004222', 'GO:0005576', 'GO:0006508', 'GO:0046872'}, 'TRINITY_DN8948_c1_g2_i1.p1': {'GO:0090729', 'GO:0005576', 'GO:0016021', 'GO:0005515', 'GO:0006887', 'GO:0044218'}, 'TRINITY_DN8959_c2_g1_i32.p1': {'GO:0090729', 'GO:0005576', 'GO:0008061', 'GO:0006030'}, 'TRINITY_DN8195_c0_g1_i8.p1': {'GO:0090729', 'GO:0005576', 'GO:0016021', 'GO:0007165', 'GO:0005515', 'GO:0006887', 'GO:0044218'}, 'TRINITY_DN9154_c0_g1_i15.p1': {'GO:0090729', 'GO:0005576', 'GO:0008061', 'GO:0006030'}, 'TRINITY_DN9193_c1_g2_i1.p1': {'GO:0005576', 'GO:0005975', 'GO:0004415'}, 'TRINITY_DN8384_c0_g1_i1.p1': {'GO:0005576', 'GO:0016021', 'GO:0005886', 'GO:0004175', 'GO:0046692', 'GO:0008237', 'GO:0005615', 'GO:0006508', 'GO:0004222', 'GO:0046872'}, 'TRINITY_DN8425_c0_g2_i1.p1': {'GO:0090729', 'GO:0005576', 'GO:0016021', 'GO:0005515', 'GO:0006887', 'GO:0044218'}, 'TRINITY_DN9192_c1_g1_i

Finally, the dictionary generated will be parsed to write the go_summary:

In [67]:
# Write one "prot_id<TAB>GO id" pair per line to the summary file.
# The GO ids of each protein are sorted before writing: iteration order of a
# set of strings can differ between interpreter sessions (string hashes are
# randomized by default), which would otherwise change the file's line order
# from one run of the notebook to the next.
with open("TOXINS_GO_SUMMARY.txt", "w") as go_summary:
    for prot_id, go_ids in GO.items():
        for go_id in sorted(go_ids):
            go_summary.write("{}\t{}\n".format(prot_id, go_id))

Note: the same results can also be obtained using comprehensions:

In [93]:
def parse_toxin_row(frame=None):
    """Yield ``(prot_id, go_ids)`` pairs, one per row of the toxins table.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with ``prot_id``, ``GO_Blastx``, ``GO_Blastp`` and ``GO_Pfam``
        columns. When omitted, the notebook-level ``toxins`` dataframe is
        used, so existing ``parse_toxin_row()`` calls keep working.

    Yields
    ------
    tuple
        ``(prot_id, go_ids)`` where ``go_ids`` is the set of fields starting
        with ``"GO:"`` found in the three GO columns of that row. The set is
        empty when the row contains no such field.
    """
    source = toxins if frame is None else frame
    for row in source.itertuples():
        # Keep only string cells: str.join() raises TypeError on NaN or any
        # other non-string value.
        annotations = [
            cell
            for cell in (row.GO_Blastx, row.GO_Blastp, row.GO_Pfam)
            if isinstance(cell, str)
        ]
        # Normalise the "`" separator to "^" so one split yields every field.
        fields = "^".join(annotations).replace("`", "^").split("^")
        # Match on "GO:" (with the colon), as the loop-based version does, so
        # that other fields that merely begin with "GO" are not collected.
        values = {field for field in fields if field.startswith("GO:")}  # Set comprehension
        yield (row.prot_id, values)

# Dictionary comprehension: map each prot_id to the set of GO ids yielded for it.
GO_DICT_COMP = {pid: go_ids for pid, go_ids in parse_toxin_row()}

# Generator comprehension: lazily build one "prot_id<TAB>GO id" string per pair.
GO_SUMMARY_COMP = (
    "{}\t{}".format(pid, go_id)
    for pid, go_ids in GO_DICT_COMP.items()
    for go_id in go_ids
)
for summary_line in GO_SUMMARY_COMP:
    print(summary_line)

TRINITY_DN9244_c0_g1_i1.p1	GO:0090729
TRINITY_DN9244_c0_g1_i1.p1	GO:0005576
TRINITY_DN9244_c0_g1_i1.p1	GO:0008270
TRINITY_DN9244_c0_g1_i1.p1	GO:0006508
TRINITY_DN9244_c0_g1_i1.p1	GO:0004222
TRINITY_DN8384_c0_g1_i4.p1	GO:0004222
TRINITY_DN8384_c0_g1_i4.p1	GO:0005576
TRINITY_DN8384_c0_g1_i4.p1	GO:0006508
TRINITY_DN8384_c0_g1_i4.p1	GO:0046872
TRINITY_DN8948_c1_g2_i1.p1	GO:0090729
TRINITY_DN8948_c1_g2_i1.p1	GO:0005576
TRINITY_DN8948_c1_g2_i1.p1	GO:0016021
TRINITY_DN8948_c1_g2_i1.p1	GO:0005515
TRINITY_DN8948_c1_g2_i1.p1	GO:0006887
TRINITY_DN8948_c1_g2_i1.p1	GO:0044218
TRINITY_DN8959_c2_g1_i32.p1	GO:0090729
TRINITY_DN8959_c2_g1_i32.p1	GO:0005576
TRINITY_DN8959_c2_g1_i32.p1	GO:0008061
TRINITY_DN8959_c2_g1_i32.p1	GO:0006030
TRINITY_DN8195_c0_g1_i8.p1	GO:0090729
TRINITY_DN8195_c0_g1_i8.p1	GO:0005576
TRINITY_DN8195_c0_g1_i8.p1	GO:0016021
TRINITY_DN8195_c0_g1_i8.p1	GO:0007165
TRINITY_DN8195_c0_g1_i8.p1	GO:0005515
TRINITY_DN8195_c0_g1_i8.p1	GO:0006887
TRINITY_DN8195_c0_g1_i8.p1	GO:0044218
TRINITY_