## Amino acid sequences

#### Set-up and initialization

In [1]:
import os
import random
import re
import urllib
import urllib.request

import numpy as np

In [2]:
# Download the SCOP95 FASTA file once and cache it under ./temp; later runs
# reuse the cached copy instead of hitting the network again.
url = 'http://pongor.itk.ppke.hu/benchmark/partials/repository/SCOP95/SCOP95.fasta'
fasta_path = os.path.join('temp', url.split('/')[-1])
if not os.path.isfile(fasta_path):
    # urlretrieve does not create missing directories, so make sure the
    # cache directory exists before writing into it.
    os.makedirs('temp', exist_ok=True)
    urllib.request.urlretrieve(url, fasta_path)

In [3]:
# Parse the cached FASTA file into ``proteins``: a dict mapping a protein name
# to the list of sequence lines found under headers carrying that name.
# ``count`` is the number of sequence lines that remain after filtering.
count = 0
proteins = {}
name = None  # name from the most recent header line; None until one is seen
braces = re.compile(r'\{.+?\}')  # raw string: '\{' is an invalid str escape

with open(os.path.join('temp', 'SCOP95.fasta'), 'r') as fh:
    for line in fh:
        line = line.strip()
        if not line:
            # Blank lines carry no data; do not record them as empty sequences.
            continue
        if line[0] == '>':
            # Header: remove every {...} segment, then drop the first three
            # whitespace-separated fields; the remainder is the protein name.
            fields = braces.sub('', line).split()
            name = ' '.join(fields[3:])
            proteins.setdefault(name, [])
        else:
            if name is None:
                raise ValueError('sequence line found before any FASTA header')
            # NOTE(review): every non-header line is stored as its own
            # sequence, i.e. this assumes one line per sequence -- confirm.
            proteins[name].append(line)
            count += 1

# Keep only proteins with at least two sequences (a pair must be drawable),
# and remove exactly the dropped sequences from the running total.
for p in list(proteins):
    if len(proteins[p]) < 2:
        count -= len(proteins[p])
        del proteins[p]

len(proteins), count

(2290, 8121)

#### Sampling and output

In [4]:
# Alphabetical ordering gives every protein name a fixed integer position
# that the sampled indices below refer to.
protein_names = sorted(proteins)

# Seeded draw of 5000 positions in [0, len(proteins)), with replacement.
# ``selections_ordered`` keeps the draw order; ``selections`` holds the
# distinct positions that were drawn.
np.random.seed(231555)
selections_ordered = [*np.random.choice(len(proteins), size=5000, replace=True)]
selections = set(selections_ordered)

In [5]:
# Build up to 2400 unique, unordered pairs of differing full-length sequences,
# each pair drawn from a single protein, and write them to ../proteins.csv.
random.seed(271)
pairs = []
seen = set()  # unordered pairs already accepted; O(1) duplicate lookup
for pn in selections_ordered:
    if len(pairs) >= 2400:
        # Quota reached: the remaining indices would not be sampled anyway.
        break
    seqs = proteins[protein_names[pn]]
    samp = random.sample(seqs, 2)
    key = frozenset(samp)  # order-insensitive identity of the pair
    # Two list positions can hold the same sequence; skip those, and skip
    # pairs already accepted in either order.
    if samp[0] != samp[1] and key not in seen:
        seen.add(key)
        pairs.append(samp)

with open('../proteins.csv', 'w') as f_out:
    f_out.write('protein1,protein2\n')
    f_out.writelines(f'{",".join(pair)}\n' for pair in pairs)

In [6]:
# Build up to 2400 unique, unordered pairs of sequences cut to their first
# 12 characters, each pair drawn from a single protein, and write them to
# ../proteins_len12.csv.
random.seed(3747)
pairs = []
seen = set()  # unordered truncated pairs already accepted; O(1) lookup
for pn in selections_ordered:
    if len(pairs) >= 2400:
        # Quota reached: the remaining indices would not be sampled anyway.
        break
    seqs = proteins[protein_names[pn]]
    samp = [seq[:12] for seq in random.sample(seqs, 2)]
    key = frozenset(samp)  # order-insensitive identity of the pair
    # Different sequences can share the same 12-character prefix; skip those,
    # and skip pairs already accepted in either order.
    if samp[0] != samp[1] and key not in seen:
        seen.add(key)
        pairs.append(samp)

with open('../proteins_len12.csv', 'w') as f_out:
    f_out.write('protein1,protein2\n')
    f_out.writelines(f'{",".join(pair)}\n' for pair in pairs)