# GO Term Embedding Generator for CAFA6

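This notebook generates embeddings for Gene Ontology (GO) terms from graph-based features (node degree and depth in the GO graph), restricted to the terms relevant to the training annotations.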

Outputs: `go_embeddings.npy` (one embedding row per GO term) and `go_terms.npy` (the GO term IDs, in the same row order).

## Setup and Load Data

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import obonet
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Data paths
data_dir = '/kaggle/input/cafa-6-protein-function-prediction/'
go_obo_file = os.path.join(data_dir, 'go-basic.obo')
train_terms_file = os.path.join(data_dir, 'train_terms.tsv')

# Load GO graph
go_graph = obonet.read_obo(go_obo_file)
print(f'Loaded GO graph with {len(go_graph.nodes)} nodes')

# Load training terms to subset
train_terms = pd.read_csv(train_terms_file, sep='\t', header=None, names=['EntryID', 'term', 'aspect'])
# NOTE(review): with header=None, a header row in the TSV (if the file ships
# with one -- confirm) is read as data and would leak a bogus "term" entry
# into the vocabulary. Keeping only values that look like GO identifiers
# drops such a row and is a no-op for a headerless file.
is_go_id = train_terms['term'].astype(str).str.startswith('GO:')
go_terms = train_terms.loc[is_go_id, 'term'].unique()
print(f'Unique GO terms in training: {len(go_terms)}')

# Subset graph: every training term plus all of its GO ancestors (superterms).
relevant_nodes = set(go_terms)
for term in go_terms:
    if term in go_graph:
        # obonet directs edges from a term to its parents (child -> parent),
        # so the ontology ancestors of a term are its networkx *descendants*;
        # nx.ancestors would return the subterms (children) instead.
        ancestors = nx.descendants(go_graph, term)
        relevant_nodes.update(ancestors)
# Training terms that are absent from the OBO file are silently dropped here,
# because subgraph() ignores nodes that are not in the graph.
go_subgraph = go_graph.subgraph(relevant_nodes)
print(f'Relevant subgraph: {len(go_subgraph.nodes)} nodes')

## Compute GO Embeddings

In [None]:
# Compute features: degree, depth
EMBEDDING_DIM = 320  # width of the saved embedding matrix

# Depth = shortest distance from an ontology root. obonet directs edges from a
# term to its parents (child -> parent), so roots are the terms with no
# outgoing edges, and walking the reversed graph from a root reaches its
# subterms. Depth is measured on the full graph so it does not depend on which
# nodes made it into the subgraph. One BFS per root replaces a per-node
# shortest-path query from an arbitrary start node.
go_roots = [node for node, out_deg in go_graph.out_degree() if out_deg == 0]
parent_to_child = go_graph.reverse(copy=False)
node_depth = {}
for root in go_roots:
    distances = dict(nx.single_source_shortest_path_length(parent_to_child, root))
    for node, dist in distances.items():
        if dist < node_depth.get(node, float('inf')):
            node_depth[node] = dist

go_features = {}
for node in go_subgraph.nodes:
    degree = go_subgraph.degree(node)
    # Terms unreachable from any root fall back to depth 0.
    depth = node_depth.get(node, 0)
    go_features[node] = [degree, depth]

# To array
go_terms_list = list(go_features.keys())
if not go_terms_list:
    raise ValueError('GO subgraph is empty; no terms to embed')
features_array = np.array([go_features[term] for term in go_terms_list], dtype=float)

# Standardize and PCA to 320 dims
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_array)
# PCA cannot return more components than min(n_samples, n_features); asking
# for 320 from a two-column feature matrix raises ValueError. Cap the
# component count and zero-pad the remaining columns so the saved matrix
# keeps the expected EMBEDDING_DIM width.
n_components = min(EMBEDDING_DIM, *features_scaled.shape)
pca = PCA(n_components=n_components)
projected = pca.fit_transform(features_scaled)
go_embeddings = np.zeros((projected.shape[0], EMBEDDING_DIM), dtype=projected.dtype)
go_embeddings[:, :n_components] = projected

print(f'Computed embeddings for {len(go_terms_list)} GO terms, dim: {go_embeddings.shape[1]}')
print(f'Informative PCA components: {n_components} (remaining columns are zero padding)')

## Save Embeddings

In [None]:
# Persist both arrays to the Kaggle working directory. Row i of the embedding
# matrix corresponds to entry i of the term array.
outputs = (
    ('/kaggle/working/go_embeddings.npy', go_embeddings),
    ('/kaggle/working/go_terms.npy', np.array(go_terms_list)),
)
for out_path, payload in outputs:
    np.save(out_path, payload)

print('GO embeddings saved to /kaggle/working/')