In [38]:
import re
import math
import json
import csv
from collections import defaultdict

import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cpu device


In [3]:
scope_file = "../../data/astral-scopedom-seqres-gd-sel-gs-bib-95-2.08.fa"
scope_pattern = re.compile("[abcdefghijkl]\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}")

In [4]:
test_header = ">d5lqwy_ g.101.1.1 (Y:) Pre-mRNA splicing factor Phf5 / Rds3 {Baker's yeast (Saccharomyces cerevisiae) [TaxId: 4932]}"
scope_pattern.search(test_header).group(0)

'g.101.1.1'

In [5]:
def load_data(data_file):
    sequences = []
    scope_codes = []

    with open(data_file, "r") as fin:
        scope_code = ""
        sequence = ""
        for line in fin:
            if line.startswith(">"):
                sequences.append(sequence)
                scope_codes.append(scope_code)

                sequence = ""
                scope_code = ""

                header = line.strip()
                m = scope_pattern.search(header)
                if m is not None:
                    scope_code = m.group(0)
            else:
                sequence += line.strip()

    sequences = sequences[1:]
    scope_codes = scope_codes[1:]
    
    return sequences, scope_codes

In [16]:
def get_scope_similarity_level(code1, code2):
    split_code1 = code1.split(".")
    split_code2 = code2.split(".")
    for i in range(4,0,-1):
        if split_code1[:i] == split_code2[:i]:
            return ".".join(split_code1[:i]), i
    return "", 0 

def get_scope_level(scope_code):
    return len(scope_code.split("."))
        
def generate_scope_pairs(scope_codes):
    scope_pairs = defaultdict(list)
    for i, scope_codei in enumerate(scope_codes):
        for j, scope_codej in enumerate(scope_codes[i+1:]):
            similarity_code, _ = get_scope_similarity_level(scope_codei, scope_codej)
            scope_pairs[similarity_code].append((i, j+i+1))
    return dict(scope_pairs)

def get_scope_level_pdf(scope_pairs):
    level_distribution = defaultdict(int)
    for scope_code, pairs in scope_pairs.items():
        if scope_code == "":
            level_distribution[0] += len(pairs)
        else:
            level_distribution[get_scope_level(scope_code)] += len(pairs)
            
    normalization_constant = sum(level_distribution.values())
    
    for scope_level, v in level_distribution.items():
        level_distribution[scope_level] /= normalization_constant
        
    return level_distribution, normalization_constant

def compute_scope_pair_cdf(scope_level_pairs, smoothing_func):
    scope_pdf = list()
    sum_npairs = 0
    for scope_level, pairs in scope_level_pairs.items():
        npairs = len(pairs)
        sum_npairs += npairs
        scope_pdf.append((scope_level, smoothing_func(npairs)))
        
    normalization_constant = sum([i[1] for i in scope_pdf])
    
    scope_pdf = [(k,v/normalization_constant) for k, v in scope_pdf]
   
    cdf = np.cumsum([i[1] for i in scope_pdf])
    scope_cdf = [(k,v) for k, v in zip([i[0] for i in scope_pdf], cdf)]

    return scope_cdf, scope_pdf

def get_sampled_element(cdf):
    a = np.random.uniform(0, 1)
    return np.argmax(cdf>=a)

def run_sampling(cdf, n=5000):
    for k in np.arange(n):
        yield get_sampled_element(cdf)

In [13]:
print(get_scope_similarity_level("a.1.10.100", "a.1.10.100"))
print(get_scope_similarity_level("a.1.10.100", "a.1.10.101"))
print(get_scope_similarity_level("a.1.10.100", "a.1.11.101"))
print(get_scope_similarity_level("a.1.10.100", "a.0.11.101"))
print(get_scope_similarity_level("a.1.10.100", "b.1.10.100"))
print(get_scope_level("a.1.10.100"))
print(get_scope_level("a"))

('a.1.10.100', 4)
('a.1.10', 3)
('a.1', 2)
('a', 1)
('', 0)
4
1


In [8]:
sequences, scope_codes = load_data(scope_file)
sequences_train, sequences_test, scope_codes_train, scope_codes_test = train_test_split(sequences, scope_codes, test_size=0.1)
scope_pairs_train = generate_scope_pairs(scope_codes_train)

In [19]:
del scope_pairs_train[0]
del scope_pairs_train[1]

In [20]:
scope_level_pdf, normalization_constant = get_scope_level_pdf(scope_pairs_train)
print(normalization_constant, len(sequences_train)*(len(sequences_train)-1)/2)    

510161653 510161653.0


In [22]:
scope_level_pdf

defaultdict(int,
            {1: 0.19767118011905924,
             0: 0.7778375377029759,
             3: 0.009940004251946393,
             4: 0.00707713521541377,
             2: 0.007474142710604711})

In [37]:
scope_pair_cdf_train, scope_pair_pdf_train = compute_scope_pair_cdf(scope_pairs_train, lambda x: np.power(x, 0.75))

In [42]:
nsets = 100
for seti in range(nsets):
    with open(f"../../data/train_set_{seti}.tsv", "w") as fout:
        tsvwriter = csv.writer(fout, delimiter='\t')
        for i in run_sampling(np.array([i[1] for i in scope_pair_cdf_train]), n=100000):
            scope_code, _ = scope_pair_cdf_train[i]
            pairs = scope_pairs_train[scope_code]
            pairi = np.random.randint(0,len(pairs), None)
            i1, i2 = pairs[pairi]
            common_scope_code, common_level = get_scope_similarity_level(scope_codes_train[i1],
                                                                         scope_codes_train[i2])
            tpl = (sequences_train[i1], 
                    sequences_train[i2], 
                    scope_codes_train[i1], 
                    scope_codes_train[i2],
                   common_level,
                  )
            tsvwriter.writerow(tpl)

In [36]:
level_distribution = defaultdict(int)
with open("../../data/train_set_0.tsv", "r") as fin:
    tsvreader = csv.csvreader(fin, delimiter="\t")
    for p in tsvreader:
        common_scope_code, level = get_scope_similarity_level(p[2],p[3])
        level_distribution[level] += 1/len(sequence_pairs)
level_distribution

defaultdict(int,
            {0: 0.23309999999999065,
             1: 0.2560999999999881,
             3: 0.18849999999999556,
             4: 0.23059999999999092,
             2: 0.0917000000000016})

In [None]:
def make_datasets(sequences, scope_codes, prefix="train", nsets=100, npairs=100000):
    scope_pairs = generate_scope_pairs(scope_codes)
    scope_pair_cdf, scope_pair_pdf = compute_scope_pair_cdf(scope_pairs, lambda x: np.power(x, 0.75))
    for seti in range(nsets):
        with open(f"../../data/{prefix}_set_{seti}.tsv", "w") as fout:
            tsvwriter = csv.writer(fout, delimiter='\t')
            for i in run_sampling(np.array([i[1] for i in scope_pair_cdf]), n=npairs):
                scope_code, _ = scope_pair_cdf[i]
                pairs = scope_pairs[scope_code]
                pairi = np.random.randint(0,len(pairs), None)
                i1, i2 = pairs[pairi]
                common_scope_code, common_level = get_scope_similarity_level(scope_codes[i1],
                                                                             scope_codes[i2])
                tpl = (sequences[i1], 
                        sequences[i2], 
                        scope_codes[i1], 
                        scope_codes[i2],
                       common_level,
                      )
                tsvwriter.writerow(tpl)    