In [1]:
import re
import math
from collections import defaultdict

import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cpu device


In [3]:
scope_file = "../../data/astral-scopedom-seqres-gd-sel-gs-bib-95-2.08.fa"
scope_pattern = re.compile("[abcdefghijkl]\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}")

In [4]:
test_header = ">d5lqwy_ g.101.1.1 (Y:) Pre-mRNA splicing factor Phf5 / Rds3 {Baker's yeast (Saccharomyces cerevisiae) [TaxId: 4932]}"
scope_pattern.search(test_header).group(0)

'g.101.1.1'

In [14]:
def load_data(data_file):
    sequences = []
    scope_codes = []

    with open(data_file, "r") as fin:
        scope_code = ""
        sequence = ""
        for line in fin:
            if line.startswith(">"):
                sequences.append(sequence)
                scope_codes.append(scope_code)

                sequence = ""
                scope_code = ""

                header = line.strip()
                m = scope_pattern.search(header)
                if m is not None:
                    scope_code = m.group(0)
            else:
                sequence += line.strip()

    sequences = sequences[1:]
    scope_codes = scope_codes[1:]
    
    return sequences, scope_codes

In [15]:
def generate_buckets(scope_codes):
    scope_levels = defaultdict(list)
    for i, scope_code in enumerate(scope_codes):
        splitted_code = scope_code.split(".")
        for j in range(1,5):
            subcode = ".".join(splitted_code[0:j])
            scope_levels[subcode].append(i)
            
    #filtered_scope_levels = dict()
    #for scope, sequence_ids in scope_levels.items():
    #    if len(sequence_ids) > bucket_size:
    #        filtered_scope_levels[scope] = sequence_ids
            
    return list(scope_levels.items())

In [94]:
def compute_scope_pair_cdf(scope_levels, total_pairs, smoothing_func):
    scope_pdf = list()
    sum_npairs = 0
    for scope_level, sequence_ids in scope_levels:
        npairs = len(sequence_ids)*(len(sequence_ids)-1)/2.0
        sum_npairs += npairs
        scope_pdf.append((scope_level, smoothing_func(npairs)))
        
    different_scope_npairs = total_pairs - sum_npairs
    print(f"total pairs {total_pairs} and different pairs {different_scope_npairs}")
    scope_pdf.append(("", smoothing_func(different_scope_npairs)))
        
    normalization_constant = sum([i[1] for i in scope_pdf])
    
    scope_pdf = [(k,v/normalization_constant) for k, v in scope_pdf]
   
    cdf = np.cumsum([i[1] for i in scope_pdf])
    scope_cdf = zip([i[0] for i in scope_pdf], cdf)

    return scope_cdf, scope_pdf

def compute_scope_cdf(scope_levels, smoothing_func):
    scope_pdf = [(scope_level, smoothing_func(len(sequence_ids))) for scope_level, sequence_ids in scope_levels]

    normalization_constant = sum([i[1] for i in scope_pdf])
    
    scope_pdf = [(k,v/normalization_constant) for k, v in scope_pdf]
    
    cdf = np.cumsum([i[1] for i in scope_pdf])
    scope_cdf = zip([i[0] for i in scope_pdf], cdf)
    
    return scope_cdf, scope_pdf

def get_sampled_element(cdf):
    a = np.random.uniform(0, 1)
    return np.argmax(cdf>=a)

def run_sampling(cdf, n=5000):
    for k in np.arange(n):
        yield get_sampled_element(cdf)

In [165]:
sequences, scope_codes = load_data(scope_file)
sequences_train, sequences_test, scope_codes_train, scope_codes_test = train_test_split(sequences, scope_codes, test_size=0.1)
scope_levels_train = generate_buckets(scope_codes_train)
total_pairs_train = len(sequences_train)*(len(sequences_train)-1)/2.0
scope_pair_cdf_train, scope_pair_pdf_train = compute_scope_pair_cdf(scope_levels_train, total_pairs_train, lambda x: np.power(x, 1))
pair_cdf_train = np.array([i[1] for i in scope_pair_cdf_train])
scope_cdf_train, scope_pdf_train = compute_scope_cdf(scope_levels_train, lambda x: x)
cdf_train = np.array([i[1] for i in scope_cdf_train])

total pairs 510161653.0 and different pairs 371609922.0


In [166]:
level_counts = defaultdict(int)
n = 10000
for i in run_sampling(cdf_train, n):
    level = len(scope_pdf_train[i][0].split("."))
    level_counts[level] += 1.0/n
level_counts

defaultdict(int,
            {2: 0.2489999999999889,
             4: 0.24959999999998883,
             1: 0.2433999999999895,
             3: 0.2579999999999879})

In [167]:
scope_pair_pdf_train[-1]

('', 0.7284160222838231)

In [168]:
sequence_pairs = list()

for i in run_sampling(pair_cdf_train, n=100000):
    if i < len(scope_levels_train):
        label, sequenceids = scope_levels_train[i]
        i1, i2 = np.random.randint(0,len(sequenceids), 2)
        seqid1 = sequenceids[i1]
        seqid2 = sequenceids[i2]
        sequence_pairs.append((sequences_train[seqid1], 
                               sequences_train[seqid2], 
                               scope_codes_train[seqid1], 
                               scope_codes_train[seqid2]))
    else:
        scope1i = 0
        scope2i = 0
        while scope1i == scope2i:
            scope1i, scope2i = [i for i in run_sampling(cdf_train, n=2)]
        scope1, sequenceids1 = scope_levels_train[scope1i]
        scope2, sequenceids2 = scope_levels_train[scope2i]
        i1 = np.random.randint(0,len(sequenceids1), 1)[0]
        i2 = np.random.randint(0,len(sequenceids2), 1)[0]
        seqid1 = sequenceids1[i1]
        seqid2 = sequenceids2[i2]
        sequence_pairs.append((sequences_train[seqid1], 
                               sequences_train[seqid2], 
                               scope_codes_train[seqid1], 
                               scope_codes_train[seqid2]))

In [169]:
count = 0
for p in sequence_pairs:
    if p[2] == p[3]:
        count += 1
count

3277

In [144]:
sequences, scope_codes = load_data(scope_file)
sequences_train, sequences_test, scope_codes_train, scope_codes_test = train_test_split(sequences, scope_codes, test_size=0.1)
scope_levels_train = generate_buckets(scope_codes_train)
scope_cdf_train, scope_pdf_train = compute_scope_cdf(scope_levels_train, lambda x: np.power(x, 0.8))
cdf_train = np.array([i[1] for i in scope_cdf_train])

In [145]:
sequence_pairs = list()

for i in range(100000):
        scope1i, scope2i = [i for i in run_sampling(cdf_train, n=2)]
        scope1, sequences1 = scope_levels_train[scope1i]
        scope2, sequences2 = scope_levels_train[scope2i]
        i1 = np.random.randint(0,len(sequences1), 1)[0]
        i2 = np.random.randint(0,len(sequences2), 1)[0]
        sequence_pairs.append((sequences1[i1], sequences2[i2], scope1, scope2))

In [146]:
count = 0
for p in sequence_pairs:
    if p[2] == p[3]:
        count += 1
count

366

In [147]:
sequence_pairs[:100]

[(10865, 11330, 'd.185.1', 'c'),
 (30315, 13272, 'd.309', 'c.55.1.0'),
 (26171, 20988, 'a', 'c'),
 (13827, 21092, 'g.3.6', 'a.32.1.1'),
 (31278, 5936, 'd.110', 'c'),
 (25095, 282, 'b.18.1.4', 'a.38.1'),
 (11874, 27738, 'd.14', 'c.1.14.0'),
 (8526, 22023, 'b.121.4.5', 'b'),
 (7129, 10178, 'd.130.1', 'g.44.1.0'),
 (2954, 13339, 'a.4.1.12', 'c.47'),
 (4361, 20060, 'a.45.1.1', 'g.101.1.1'),
 (13080, 7702, 'd.245.1', 'b'),
 (12919, 18104, 'b.1.2', 'd.58.7'),
 (7390, 28877, 'c.55.1', 'a'),
 (7446, 22349, 'd.185.1.1', 'b.74.1'),
 (18957, 29736, 'd.20.1.0', 'a.22.1'),
 (20399, 26286, 'd.92.1', 'd.139.1.1'),
 (31810, 20457, 'a.24.19.0', 'b'),
 (10645, 19534, 'b.78.1.0', 'd.186.1.1'),
 (22816, 8796, 'd.66.1', 'd.58.7.0'),
 (11626, 29086, 'g.79', 'c.37.1'),
 (154, 11591, 'b.1.1', 'b'),
 (24399, 19071, 'b.82', 'g'),
 (29226, 8688, 'b.147.1', 'd.41.2.0'),
 (27507, 11461, 'b.23', 'd'),
 (20991, 21266, 'b.19.1', 'd.198.1.1'),
 (17573, 20858, 'c.94.1.0', 'b.50.1'),
 (12703, 10519, 'c.67.1', 'c.98.1'),