# Data preparation

## Setup

In [78]:
import json
import os
from Bio import Entrez
from Bio import SeqIO

# Contact address handed to Bio.Entrez; it is set once here, before any
# Entrez request in this notebook is made.
# NOTE(review): NCBI asks E-utilities users to identify themselves with an
# e-mail address - confirm this one is still monitored.
Entrez.email = 'tgw325@alumni.ku.dk'

## Downloading disordered regions

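The sequence records of the proteins listed in **accession_numbers.json** are downloaded from the NCBI protein database.
For each protein, the annotated disordered region is then extracted and saved in FASTA format in a directory named after its protein group.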

In [77]:
# Download every protein listed in accession_numbers.json, extract its
# annotated disordered region and write it to <group>/<accession>.fasta.
#
# The JSON file is read as a two-level mapping:
#   {group name: {protein name: accession number}}

# Accessing file containing accession numbers:
with open('accession_numbers.json', 'r') as file:
    accession_numbers = json.load(file)

# Looping over protein groups
print("Exctracting the following IDRs:")
for group, proteins in accession_numbers.items():

    # Creating dirs for groups
    os.makedirs(group, exist_ok=True)

    # Looping over proteins
    for name, acc_num in proteins.items():

        # Getting data in Genbank format; the handle is always closed,
        # even if parsing the record fails
        handle = Entrez.efetch(db='protein', id=acc_num, rettype='gp', retmode='text')
        try:
            record = SeqIO.read(handle, 'genbank')
        finally:
            handle.close()

        # Reset per protein so that a match from a previous iteration can
        # never be written out under the current accession number
        dis_seq = None
        loc = None

        # Finding disordered region of protein
        for feature in record.features:
            if feature.type != 'Region':
                continue

            # Region features without a 'region_name' qualifier are skipped
            # instead of raising KeyError
            region_names = feature.qualifiers.get('region_name') or ['']
            if 'Disordered' in region_names[0]:

                # Extracting disordered sequence; if several regions match,
                # the last one in the record is kept
                dis_seq = feature.extract(record.seq)
                loc = feature.location

        # Nothing to save if the record has no annotated disordered region
        if dis_seq is None:
            print(f" - {name} ({acc_num}):\tno disordered region found, skipped")
            continue

        # Saving disordered region as FASTA file
        print(f" - {name} ({acc_num}):\t{len(dis_seq)} AA\t{loc}")
        record.seq = dis_seq
        with open(f'{group}/{acc_num}.fasta', 'w') as file:
            SeqIO.write(record, file, 'fasta')


Exctracting the following IDRs:
 - Human histone H2B (NP_003519.1):	35 AA	[0:35]
