# Data preparation

## Setup

In [1]:
import json
import os
from Bio import Entrez
from Bio import SeqIO

# Contact e-mail address set on the Entrez module before the queries issued further down.
Entrez.email = 'tgw325@alumni.ku.dk'

## Downloading disordered regions

The sequences of the proteins listed in **accession_numbers.json** are downloaded.
Afterwards, the selected disordered region (N- or C-terminal) of each protein is saved in FASTA format.

In [3]:
# Accessing file containing accession numbers.
# The loops below expect the layout {group: {protein name: {'acc_num': ..., 'terminal': ...}}}.
with open('accession_numbers.json', 'r') as file:
    accession_numbers = json.load(file)

# Looping over protein groups
print("Exctracting the following IDRs:")
for group, proteins in accession_numbers.items():

    # Creating dirs for groups
    os.makedirs(group, exist_ok=True)

    # Looping over proteins
    for name, description in proteins.items():
        acc_num = description['acc_num']
        terminal = description['terminal']

        # Fail fast on an unsupported terminal, before any network request is made
        if terminal not in ('NTD', 'CTD'):
            raise ValueError(
                f"Unknown terminal {terminal!r} for {name} ({acc_num}); expected 'NTD' or 'CTD'"
            )

        # Getting data in Genbank format; the handle is closed even if parsing fails
        handle = Entrez.efetch(db='protein', id=acc_num, rettype='gp', retmode='text')
        try:
            record = SeqIO.read(handle, 'genbank')
        finally:
            handle.close()

        # Finding disordered regions of protein as (sequence, location) pairs.
        # Region features lacking a 'region_name' qualifier are skipped.
        disordered = [
            (feature.extract(record.seq), feature.location)
            for feature in record.features
            if feature.type == 'Region'
            and 'Disordered' in feature.qualifiers.get('region_name', [''])[0]
        ]
        if not disordered:
            raise ValueError(f"No disordered region annotated for {name} ({acc_num})")

        # Choosing either NTD or CTD disordered region (in case multiple exist):
        # the first match in feature order for NTD, the last one for CTD
        dis_seq, loc = disordered[0] if terminal == 'NTD' else disordered[-1]

        # Saving disordered region as FASTA file
        print(f" - {name} {terminal} ({acc_num}):\t{len(dis_seq)} AA\t{loc}")
        record.seq = dis_seq
        record.description += f" disordered {terminal}"
        with open(f'{group}/{name + " " + terminal}.fasta', 'w') as fasta_file:
            SeqIO.write(record, fasta_file, 'fasta')


Exctracting the following IDRs:
 - Human H1-0 CTD (NP_005309.1):	111 AA	[83:194]
 - Human H1-1 CTD (NP_005316.1):	122 AA	[93:215]
 - Human H1-2 CTD (NP_005310.1):	122 AA	[91:213]
 - Human H1-3 CTD (NP_005311.1):	132 AA	[89:221]
 - Human H1-4 CTD (NP_005312.1):	128 AA	[91:219]
 - Human H2B NTD (NP_003519.1):	35 AA	[0:35]
 - Human H3-4 NTD (NP_003484.1):	43 AA	[0:43]
