In [3]:
import pandas as pd


# Load the tab-separated training annotations; the file's first row supplies
# the column names (EntryID, term, aspect in the rendered output).
train_terms = pd.read_csv(
    "data/Train/train_terms.tsv",
    sep="\t",
)
# Bare last expression so the notebook renders the frame as the cell output.
train_terms


Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P
...,...,...,...
537022,Q06667,GO:0070481,P
537023,B1NF19,GO:0033075,P
537024,B1NF19,GO:0047052,F
537025,B1NF19,GO:0047056,F


In [15]:
def read_fasta_to_dataframe(fasta_filepath) -> pd.DataFrame:
    """
    Reads a FASTA file and returns a pandas DataFrame with one row per record.

    The DataFrame has these columns, in this order:
    'id', 'description', 'sequence', 'name', 'dbxrefs'.
    'sequence' holds the record's sequence converted to a plain string; the
    other columns are copied from the record attributes of the same name.

    The column set is fixed even when the file contains no records, so
    downstream column access does not fail on an empty input.

    Args:
        fasta_filepath: Source handed unchanged to ``SeqIO.parse``.

    Returns:
        A DataFrame with the five columns listed above.
    """
    # NOTE(review): relies on `SeqIO` already being in scope (presumably
    # `from Bio import SeqIO`); no such import is visible in this section.
    columns = ['id', 'description', 'sequence', 'name', 'dbxrefs']
    records = [
        {
            'id': record.id,
            'description': record.description,
            'sequence': str(record.seq),  # Convert Seq object to string
            'name': record.name,
            'dbxrefs': record.dbxrefs,
        }
        for record in SeqIO.parse(fasta_filepath, "fasta")
    ]
    # An explicit column list keeps the schema stable for an empty record list;
    # without it pandas would return a frame with no columns at all.
    return pd.DataFrame(records, columns=columns)

# Parse the training sequences into a DataFrame, one row per FASTA record.
df = read_fasta_to_dataframe(
    "data/Train/train_sequences.fasta"
)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,id,description,sequence,name,dbxrefs
0,sp|A0A0C5B5G6|MOTSC_HUMAN,sp|A0A0C5B5G6|MOTSC_HUMAN Mitochondrial-derive...,MRWQEMGYIFYPRKLR,sp|A0A0C5B5G6|MOTSC_HUMAN,[]
1,sp|A0JNW5|BLT3B_HUMAN,sp|A0JNW5|BLT3B_HUMAN Bridge-like lipid transf...,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,sp|A0JNW5|BLT3B_HUMAN,[]
2,sp|A0JP26|POTB3_HUMAN,sp|A0JP26|POTB3_HUMAN POTE ankyrin domain fami...,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,sp|A0JP26|POTB3_HUMAN,[]
3,sp|A0PK11|CLRN2_HUMAN,sp|A0PK11|CLRN2_HUMAN Clarin-2 OS=Homo sapiens...,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,sp|A0PK11|CLRN2_HUMAN,[]
4,sp|A1A4S6|RHG10_HUMAN,sp|A1A4S6|RHG10_HUMAN Rho GTPase-activating pr...,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,sp|A1A4S6|RHG10_HUMAN,[]
...,...,...,...,...,...
82399,sp|Q9UTM1|YIV1_SCHPO,sp|Q9UTM1|YIV1_SCHPO Uncharacterized protein C...,MSKLKAQSALQKLIESQKNPNANEDGYFRRKRLAKKERPFEPKKLV...,sp|Q9UTM1|YIV1_SCHPO,[]
82400,sp|Q9Y7I1|YE1K_SCHPO,sp|Q9Y7I1|YE1K_SCHPO Uncharacterized protein C...,MSSNSNTDHSTGDNRSKSEKQTDLRNALRETESHGMPPLRGPAGFP...,sp|Q9Y7I1|YE1K_SCHPO,[]
82401,sp|Q9Y7P7|YQ63_SCHPO,sp|Q9Y7P7|YQ63_SCHPO Uncharacterized protein C...,MRSNNSSLVHCCWVSPPSLTRLPAFPSPRILSPCYCYNKRIRPFRG...,sp|Q9Y7P7|YQ63_SCHPO,[]
82402,sp|Q9Y7Q3|YQ6A_SCHPO,sp|Q9Y7Q3|YQ6A_SCHPO Uncharacterized protein C...,MHSSRRKYNDMWTARLLIRSDQKEEKYPSFKKNAGKAINAHLIPKL...,sp|Q9Y7Q3|YQ6A_SCHPO,[]


In [4]:
import pandas as pd


# Load the protein -> taxon mapping. Column names are supplied here, so the
# first line of the file is read as data rather than as a header row.
train_taxonomy = pd.read_csv(
    "data/Train/train_taxonomy.tsv",
    sep="\t",
    header=None,  # already implied by passing `names`; spelled out for readers
    names=['EntryID', 'TaxonID'],
)
# Bare last expression so the notebook renders the frame as the cell output.
train_taxonomy


Unnamed: 0,EntryID,TaxonID
0,A0A0C5B5G6,9606
1,A0JNW5,9606
2,A0JP26,9606
3,A0PK11,9606
4,A1A4S6,9606
...,...,...
82399,Q9UTM1,284812
82400,Q9Y7I1,284812
82401,Q9Y7P7,284812
82402,Q9Y7Q3,284812


In [12]:
import obonet
import networkx as nx


# Parse the Gene Ontology OBO file into a graph whose nodes are GO term IDs.
go_graph = obonet.read_obo('data/Train/go-basic.obo')

# Size summary of the parsed ontology.
print(f"Number of nodes: {go_graph.number_of_nodes()}")
print(f"Number of edges: {go_graph.number_of_edges()}")

# Check a root node
root_mf = 'GO:0003674'
if root_mf in go_graph:
    print(f"\nMolecular Function Root ({root_mf}): {go_graph.nodes[root_mf]['name']}")
    print(f"  Attributes: {go_graph.nodes[root_mf]}")

# Sample terms
print("\nFirst 3 terms and their names:")
# Iterate over the node IDs directly: the positional index was never used and
# only left a stray `i` behind in the kernel namespace.
for node_id in list(go_graph.nodes())[:3]:
    # `.get` falls back to 'N/A' for a node that carries no 'name' attribute.
    print(f"- {node_id}: {go_graph.nodes[node_id].get('name', 'N/A')}")

Number of nodes: 40122
Number of edges: 77229

Molecular Function Root (GO:0003674): molecular_function
  Attributes: {'name': 'molecular_function', 'namespace': 'molecular_function', 'alt_id': ['GO:0005554'], 'def': '"A molecular process that can be carried out by the action of a single macromolecular machine, usually via direct physical interactions with other molecular entities. Function in this sense denotes an action, or activity, that a gene product (or a complex) performs." [GOC:pdt]', 'comment': "Note that, in addition to forming the root of the molecular function ontology, this term is recommended for the annotation of gene products whose molecular function is unknown. When this term is used for annotation, it indicates that no information was available about the molecular function of the gene product annotated as of the date the annotation was made; the evidence code 'no data' (ND), is used to indicate this. Despite its name, this is not a type of 'function' in the sense typi