## Amino acid sequences

### Set-up and initialization

In [1]:
import os
import random
import urllib

import numpy as np

In [2]:
# Download and decompress UniRef50 only when neither the raw FASTA nor the
# derived one-sequence-per-line file already exists under temp/.
if (not os.path.isfile(os.path.join('temp', 'uniref50.lines')) and
    not os.path.isfile(os.path.join('temp', 'uniref50.fasta'))):
    import gzip
    import shutil
    # A bare `import urllib` does not guarantee that the `request`
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz'
    archive_path = os.path.join('temp', url.split('/')[-1])
    fasta_path = os.path.join('temp', 'uniref50.fasta')
    partial_path = fasta_path + '.part'

    # urlretrieve opens the destination directly and will not create
    # missing parent directories.
    os.makedirs('temp', exist_ok=True)
    urllib.request.urlretrieve(url, archive_path)

    # Decompress into a scratch file and rename it afterwards, so an
    # interrupted run never leaves a truncated uniref50.fasta that the
    # existence check above would accept on the next run.
    with gzip.open(archive_path, 'rb') as f_in:
        with open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(partial_path, fasta_path)

In [3]:
# Flatten the FASTA file into temp/uniref50.lines (one sequence per line,
# headers dropped) and leave the number of sequences in `count`.
# If the flattened file already exists, only its lines are counted.
count = 0
lines_path = os.path.join('temp', 'uniref50.lines')
if not os.path.isfile(lines_path):
    # Write to a scratch file first and rename at the end: the existence
    # check above would otherwise accept a half-written file left behind
    # by an interrupted conversion.
    partial_path = lines_path + '.part'
    with open(partial_path, 'w') as f_out:
        with open(os.path.join('temp', 'uniref50.fasta'), 'r') as f_in:
            for line in f_in:
                if line.startswith('>'):
                    # A header starts a new record: terminate the previous
                    # sequence line (nothing to terminate before the first).
                    if count:
                        f_out.write('\n')
                    count += 1
                else:
                    # Sequence data may be wrapped over several FASTA lines;
                    # concatenate the pieces onto the current output line.
                    f_out.write(line.strip())
            # Terminate the last sequence.
            f_out.write('\n')
    os.replace(partial_path, lines_path)
else:
    with open(lines_path) as f_count:
        count = sum(1 for _ in f_count)
# Bare expression so the notebook displays the sequence count.
count

35763834

### Sampling and output

In [4]:
# Draw 4800 distinct line indices reproducibly (fixed seed), then pull the
# matching sequences out of the flattened file in a single pass.
np.random.seed(231555)
selections_ordered = list(np.random.choice(count, 4800, replace=False))
# Set copy for O(1) membership tests while streaming the file; the list
# keeps the random draw order for the writers below.
selections = set(selections_ordered)

with open(os.path.join('temp', 'uniref50.lines'), 'r') as f_in:
    # Maps 0-based line index -> sequence text with the newline stripped.
    proteins = {
        index: sequence.strip()
        for index, sequence in enumerate(f_in)
        if index in selections
    }

In [5]:
# Write the sampled sequences as a two-column CSV: consecutive draws are
# paired, the even-positioned one in `protein1` and the next in `protein2`.
with open('../proteins.csv', 'w') as f_out:
    f_out.write('protein1,protein2\n')
    for position, line_index in enumerate(selections_ordered):
        # Even positions open a row (followed by a comma); odd ones close it.
        terminator = ',' if position % 2 == 0 else '\n'
        f_out.write(proteins[line_index] + terminator)

In [6]:
# Write a two-column CSV of random 12-residue windows, one window per
# sampled sequence, paired in the same order as proteins.csv.
random.seed(3747)
with open('../proteins_len12.csv', 'w') as f_out:
    f_out.write('protein1,protein2\n')
    for i, line_index in enumerate(selections_ordered):
        subsample = proteins[line_index]
        # Clamp the upper bound at 0: for a sequence shorter than 12
        # residues `len(subsample) - 12` is negative and random.randint
        # would raise ValueError. Such a sequence is written out whole.
        # Draws for sequences of length >= 12 are unchanged.
        start = random.randint(0, max(0, len(subsample) - 12))
        f_out.write(subsample[start:start + 12])
        # Even positions open a row (comma); odd positions close it.
        f_out.write('\n' if i % 2 else ',')