# Data preprocessing

## Load data

In [22]:
import gzip

# Organism whose STRING network is processed. '9606' is human; '4932' is the
# other id that the lookup tables in later cells have entries for.
org_id = '9606'

# interactions: protein id -> set of partner ids (stored in both directions).
# data: every retained pair exactly once, in the orientation it was first seen.
interactions = {}
data = []

with gzip.open(f'data/{org_id}.protein.links.v11.0.txt.gz', 'rt') as links_file:
    next(links_file)  # the first line is a header, not a record
    for row in links_file:
        p1, p2, score = row.strip().split()
        # Only links scoring at least 700 are kept (high confidence)
        if float(score) < 700:
            continue
        partners_of_p1 = interactions.setdefault(p1, set())
        partners_of_p2 = interactions.setdefault(p2, set())
        # The reverse direction of an already-stored pair is not recorded again
        if p2 in partners_of_p1:
            continue
        partners_of_p1.add(p2)
        partners_of_p2.add(p1)
        data.append((p1, p2))

print('Total number of interactions:', len(data))
print('Total number of proteins:', len(interactions))


Total number of interactions: 420534
Total number of proteins: 17185


## Split the data into training, validation and testing sets


In [25]:
import numpy as np
import math

np.random.seed(seed=0) # Fix random seed for reproducibility
# Shuffle a copy rather than `data` itself: shuffling in place would make this
# cell non-idempotent (re-running it would shuffle the already-shuffled list
# and yield a different split even though the seed is reset).
shuffled = list(data)
np.random.shuffle(shuffled)

# 80% of all pairs go to train+validation; 80% of that part is the training set.
train_n = int(math.ceil(len(shuffled) * 0.8))
valid_n = int(math.ceil(train_n * 0.8))
train_data = shuffled[:valid_n]
valid_data = shuffled[valid_n:train_n]
test_data = shuffled[train_n:]
print('Number of training interactions:', len(train_data))
print('Number of validation interactions:', len(valid_data))
print('Number of testing interactions:', len(test_data))

Number of training interactions: 269143
Number of validation interactions: 67285
Number of testing interactions: 84106


## Save the data

In [27]:
def save(filename, data):
    """Write protein pairs to `filename` as tab-separated lines.

    Each pair (p1, p2) in `data` produces two lines, one per direction:
    first ``p1<TAB>p2`` and then ``p2<TAB>p1``. The file is opened in write
    mode, so existing content is replaced.
    """
    with open(filename, 'w') as out:
        for first, second in data:
            for left, right in ((first, second), (second, first)):
                out.write(f'{left}\t{right}\n')

# Persist each positive split into its own directory.
positive_outputs = [
    (f'data/train/{org_id}.protein.links.v11.0.txt', train_data),
    (f'data/valid/{org_id}.protein.links.v11.0.txt', valid_data),
    (f'data/test/{org_id}.protein.links.v11.0.txt', test_data),
]
for out_path, pairs in positive_outputs:
    save(out_path, pairs)

## Generate negative interactions

In [None]:
import random

# Every protein that takes part in at least one retained interaction.
proteins = set()
for (p1, p2) in data:
    proteins.add(p1)
    proteins.add(p2)

# random.sample() needs a sequence: sampling from a set is deprecated since
# Python 3.9 and raises TypeError from 3.11. Sorting also gives a population
# order that does not depend on per-process string hashing.
protein_list = sorted(proteins)

# Dedicated, seeded generator so the sampled negatives are reproducible.
rng = random.Random(0)

negatives = []
seen_pairs = set()  # O(1) duplicate check; the list alone would be O(n) per lookup
# NOTE: like the original loop, this never terminates if fewer than len(data)
# distinct non-interacting pairs exist.
while len(negatives) < len(data):
    prot1, prot2 = rng.sample(protein_list, 2)
    # Reject a pair already drawn in either orientation
    if (prot1, prot2) in seen_pairs or (prot2, prot1) in seen_pairs:
        continue
    # Keep the pair only if it is not a known (high-confidence) interaction
    if prot1 not in interactions[prot2]:
        negatives.append((prot1, prot2))
        seen_pairs.add((prot1, prot2))
print('Total number of negative interactions:', len(negatives))

# Split negative data with the same boundaries as the positive split
neg_train_data = negatives[:valid_n]
neg_valid_data = negatives[valid_n:train_n]
neg_test_data = negatives[train_n:]
print('Number of negative training interactions:', len(neg_train_data))
print('Number of negative validation interactions:', len(neg_valid_data))
print('Number of negative testing interactions:', len(neg_test_data))

# Save negative data
save(f'data/train/{org_id}.negative_interactions.txt', neg_train_data)
save(f'data/valid/{org_id}.negative_interactions.txt', neg_valid_data)
save(f'data/test/{org_id}.negative_interactions.txt', neg_test_data)

## Preprocess GO annotations
### Load the ID mapping from annotation-database IDs to StringDB IDs

In [None]:
# mapping: alias id (as used by the annotation file) -> set of STRING ids
mapping = {}
# Per organism, the alias source whose ids are used for the mapping
source = {'4932': 'SGD_ID', '9606': 'Ensembl_UniProt_AC'} # mapping source

with gzip.open(f'data/{org_id}.protein.aliases.v11.0.txt.gz', 'rt') as alias_file:
    next(alias_file)  # first line is a header
    for row in alias_file:
        string_id, p_id, sources = row.strip().split('\t')
        # The third column lists whitespace-separated source names; keep the
        # alias only when the wanted source is among them.
        if source[org_id] in sources.split():
            mapping.setdefault(p_id, set()).add(string_id)
print('Loaded mappings', len(mapping))


### Load annotations

In [None]:
# Annotation file name per organism id
gaf_files = {'4932': 'sgd.gaf.gz', '9606': 'goa_human.gaf.gz'}
# Unique (STRING id, GO id) pairs
annotations = set()
with gzip.open(f'data/{gaf_files[org_id]}', 'rt') as gaf:
    for record in gaf:
        # Lines starting with '!' are header lines
        if record.startswith('!'):
            continue
        fields = record.strip().split('\t')
        p_id, go_id, evidence = fields[1], fields[4], fields[6]
        # Drop predicted (IEA) and no-data (ND) annotations
        if evidence in ('IEA', 'ND'):
            continue
        # NOTE(review): the column at index 3 is never inspected here; if the
        # file marks negated annotations there (e.g. 'NOT'), they are kept.
        # Confirm this is intended.
        # Ids without a STRING mapping contribute nothing
        for s_id in mapping.get(p_id, ()):
            annotations.add((s_id, go_id))
print('Number of annotations:', len(annotations))

# Save annotations, one tab-separated pair per line
with open(f'data/train/{org_id}.annotation.txt', 'w') as out:
    out.writelines(f'{s_id}\t{go_id}\n' for s_id, go_id in annotations)
        

## Generate Plain Training Data

In [None]:
import os

# Collect every training triple in one file, one '<s> <p> <o> .' statement per
# line. The output handle lives in a `with` block so it is closed even when
# reading or parsing one of the inputs raises.
with open(f'data/train/{org_id}.plain.nt', 'w') as tdf:
    # Load GO
    with open('data/go.obo') as f:
        tid = ''  # id of the stanza currently being read
        for line in f:
            line = line.strip()
            if line.startswith('id:'):
                tid = line[4:]
            # Skip lines belonging to stanzas whose id is not a GO term
            if not tid.startswith('GO:'):
                continue
            if line.startswith('is_a:'):
                # Drop the trailing ' ! <name>' part, keep the parent id
                tid2 = line[6:].split(' ! ')[0]
                tdf.write(f'<http://{tid}> <http://is_a> <http://{tid2}> .\n')
            if line.startswith('relationship:'):
                # 'relationship: <type> <target> ! <name>' -> [type, target]
                it = line[14:].split(' ! ')[0].split()
                tdf.write(f'<http://{tid}> <http://{it[0]}> <http://{it[1]}> .\n')

    # Load interactions
    with open(f'data/train/{org_id}.protein.links.v11.0.txt') as f:
        for line in f:
            it = line.strip().split()
            tdf.write(f'<http://{it[0]}> <http://interacts> <http://{it[1]}> .\n')

    # Load annotations
    with open(f'data/train/{org_id}.annotation.txt') as f:
        for line in f:
            it = line.strip().split()
            tdf.write(f'<http://{it[0]}> <http://hasFunction> <http://{it[1]}> .\n')

# Make sure the output directory exists; exist_ok avoids the racy
# check-then-create sequence.
os.makedirs('data/transe', exist_ok=True)
# Print the line count of the generated triples file.
! wc -l 'data/train/{org_id}.plain.nt'

## Generate Classes Training Data for ELEmbeddings

In [None]:
# Make sure the output directory exists; exist_ok makes this a no-op when the
# directory is already there and avoids the racy check-then-create sequence.
os.makedirs('data/elembeddings', exist_ok=True)
# Run the GenerateTrainingDataClasses Groovy script on the training interactions
# and annotation files; presumably -i/-a are its inputs and -o the OWL output
# (flag meanings inferred from the file names — confirm against the script).
! groovy el-embeddings/GenerateTrainingDataClasses -i 'data/train/{org_id}.protein.links.v11.0.txt' -a 'data/train/{org_id}.annotation.txt' -o 'data/train/{org_id}.classes.owl'


### Normalize training data classes into four normal forms

In [None]:

# Run Normalizer.groovy with jcel.jar on the classpath, passing the classes OWL
# file as -i and the normalized file name as -o (presumably input/output —
# confirm against the script).
! groovy -cp el-embeddings/jar/jcel.jar el-embeddings/Normalizer.groovy -i 'data/train/{org_id}.classes.owl' -o 'data/train/{org_id}.classes-normalized.owl'



## Generate RDF Representation of ELEmbeddings training data

In [None]:

# Run rapper on the classes OWL file, requesting N-Triples output (-o ntriples),
# and redirect its standard output into the .nt file.
! rapper 'data/train/{org_id}.classes.owl' -o ntriples > 'data/train/{org_id}.classes-rdf.nt'


## Generate Onto/OPA2Vec-compatible associations

In [19]:
import re

# Write one OPA2Vec association line per (protein, GO term) annotation.
go_id_splitter = re.compile('[A-Z]|:+')
with open(f'data/train/{org_id}.OPA_associations.txt', 'w') as assoc_file:
    for p_id, go_id in annotations:
        # Splitting an id such as 'GO:0008150' on capital letters and colons
        # yields three empty strings followed by the numeric part (index 3).
        go_digits = go_id_splitter.split(go_id)[3]
        assoc_file.write(str(p_id) + " \t" + "<http://purl.obolibrary.org/obo/GO_" + str(go_digits) + ">\n")