# Data preparation

### Setup

In [3]:
from tqdm import tqdm
import pandas as pd
import json
import os
from src import process_data

## Getting template metadata

This section uses general code blocks to prepare template metadata in experiment .json files.

Fetching metadata requires a starting metadata .json file containing UniProt IDs (See `~/exp/README.md`).
Custom IDs can be arbitrary, and may be reassigned during the following procedures.

#### How to get template metadata
1. **Specify experiment**
2. **Specify templates**
3. **Get metadata**
4. **Format custom protein IDs** (_Optional_)
5. **Save metadata**

#### 1. Experiment name

In [7]:
experiment = 'initial'

In [74]:
experiment = 'ortho'

#### 2. Specify templates

In [75]:
# Getting template UniProt IDs by experiment-specific methods
if experiment == 'initial':
    uids = ['P07305', 'B4DR52']
    ids = ['H1-0', 'H2B']

elif experiment == 'ortho':
    # Getting BLAST hits
    with open('exp/ortho/H1-0_pblast.txt', 'r') as file:
        lines = file.readlines()
    uids = [line.strip().split(':')[1] for line in lines]
    # Setting arbitrary custom protein IDs for now
    ids = range(len(uids))

# Loading into metadata with arbitrary custom IDs
metadata_path = f'exp/{experiment}/{experiment}.json'
metadata = {'templates': {id: {"uniprot_id": uniprot_id} for uniprot_id, id in zip(uids, ids)}}
with open(metadata_path, 'w') as file:
    json.dump(metadata, file, indent=4)

#### 3. Get template metadata

In [76]:
# Getting experiment json
with open(metadata_path, 'r') as file:
    metadata = json.load(file)

# Getting names, descriptions, and species
for id, fields in tqdm(metadata['templates'].items()):
    uniprot_id = fields['uniprot_id']
    name, desc, spec, seq = process_data.get_protein_metadata(uniprot_id)
    metadata['templates'][id] = {'uniprot_id': uniprot_id, 'name': name, 'description': desc, 'species': spec, 'sequence': seq}

metadata

100%|██████████| 250/250 [00:49<00:00,  5.09it/s]


{'templates': {'0': {'uniprot_id': 'P07305',
   'name': 'H10_HUMAN',
   'description': 'Histone H1.0',
   'species': 'Homo sapiens (Human)',
   'sequence': 'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSSRQSIQKYIKSHYKVGENADSQIKLSIKRLVTTGVLKQTKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKAASKAPTKKPKATPVKKAKKKLAATPKKAKKPKTVKAKPVKASKPKKAKPVKPKAKSSAKRAGKKK'},
  '1': {'uniprot_id': 'A0A8D2F8B3',
   'name': 'A0A8D2F8B3_THEGE',
   'description': 'Histone H1.0',
   'species': 'Theropithecus gelada (Gelada baboon)',
   'sequence': 'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSSRQSIQKYIKSHYKVGENADSQIKLSIKRLVTTGVLKQTKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKAASKAPTKKPKATPVKKAKKKLAATPKKAKKPKTVKAKPVKASKPKKAKPVKPKAKSSAKRAGKKK'},
  '2': {'uniprot_id': 'A0A2K5H919',
   'name': 'A0A2K5H919_COLAP',
   'description': 'H15 domain-containing protein',
   'species': "Colobus angolensis palliatus (Peters' Angolan colobus)",
   'sequence': 'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSSRQSIQKYIKSHYKVGENADSQIKL

#### 4. Format custom protein IDs _(Optional)_

In [77]:
# Setting IDs as UniProt protein name species suffix
if experiment == 'ortho':
    metadata['templates'] = {record['name'].split('_')[1]: record for record in metadata['templates'].values()}

metadata['templates'].keys()

dict_keys(['HUMAN', 'THEGE', 'COLAP', 'MACNE', 'GORGO', 'MANLE', '9PRIM', 'PANPA', 'PAPAN', 'CERAT', 'PANTR', 'MACMU', 'PONAB', 'NOMLE', 'RHIBE', 'RHIRO', 'CHLSB', 'SAIBB', 'CALJA', 'SAPAP', 'AOTNA', 'CEBIM', 'ACIJB', 'CROCR', 'LYNCA', 'CASCN', 'ERIEU', 'FELCA', 'VICPA', 'PIPKU', 'BALAS', 'PHOSS', 'NEOVI', 'LIPVE', 'BALMU', 'CALUR', 'AILME', 'PIG', 'OCTDE', 'MYOLU', 'MYOMY', 'MYODS', 'ENHLU', 'TUPCH', 'MUNRE', 'SURSU', '9CETA', 'PHYMC', 'ODOVR', 'CANLF', 'MUSPF', 'CAVPO', 'PANTA', 'PTEAL', 'HIPAR', 'MOSMO', 'PTEVA', 'TURTR', 'ROUAE', 'LEPWE', 'MOLMO', '9CHIR', 'RHIFE', 'GALPY', 'BOVIN', 'HETGA', 'DICBM', 'BOSIN', 'BISBI', 'BOBOX', 'URSAM', 'URSMA', 'NYCPR', 'EQUAS', 'VULVU', 'ODORO', 'PANLE', 'SHEEP', 'CAPHI', 'MONMO', 'HORSE', 'DELLE', 'LOXAF', 'NANGA', 'FUKDA', 'RAT', 'MOUSE', '9PASS', 'MUSSI', 'OTOGA', 'MICOH', 'CRIGR', 'ORYAF', 'ICTTR', 'CHRAS', 'PROSS', 'UROPR', 'MARMO', 'MESAU', 'DIPOR', 'PERMB', 'SCIVU', 'MARMA', 'TRIMA', 'JACJA', 'BALPH', 'CHRPI', 'MYOBR', '9SAUR', 'TERCA', 'CA

#### 5. Saving metadata

In [78]:
# Saving metadata
with open(metadata_path, 'w') as file:
    json.dump(metadata, file, indent=4)

## Filtering templates

### H1.0 orthologs

Some hits from the BLAST are irrelevant, as they are most likely not orthologs, like:
- "2-amino-3-ketobutyrate coenzyme A ligase, mitochondrial"
- "8-amino-7-oxononanoate synthase-like"
- "(raccoon dog) hypothetical protein"

Additionally, some hits are more subtle orthologs, like other members of the H15 superfamily:
- "H5 protein"
- "H15 domain-containing protein"

Lastly, some are in a grey area of  the H1.0 definition:
- "histone H1.0-like"
- "Histone H1.0-B"
- "histone H1.0-B-like"
- "H1 histone family, member 0"

This section is used to specify the names of the orthologs to be used for analysis

In [79]:
# Loading metadata
metadata_path = "exp/ortho/ortho.json"
with open(metadata_path, 'r') as file:
    metadata = json.load(file)['templates']

templates = pd.DataFrame(metadata).transpose()

In [92]:
# Finding all unique descriptions / names
templates['description'].value_counts()

H1.0 linker histone                                        53
H5 protein                                                 43
Histone H1.0                                               34
histone H1.0                                               29
H15 domain-containing protein                              16
Histone H5                                                  8
histone H1.0-like                                           4
2-amino-3-ketobutyrate coenzyme A ligase, mitochondrial     2
H10 protein                                                 2
(raccoon dog) hypothetical protein                          1
8-amino-7-oxononanoate synthase-like                        1
H1 histone family member 0                                  1
histone H1.0-B-like                                         1
Histone H1.0-A                                              1
H1 histone family, member 0                                 1
Histone H1.0-B                                              1
histone 

In [81]:
# Setting description requirements
required  = ['H1', 'H5' ]

In [85]:
# Filtering off
filtered = templates[templates.apply(lambda row: max([req in row['description'] for req in required]), axis=1)]

# Displaying proteins that were filtered off
templates.loc[list(set(templates.index) - set(filtered.index))]

Unnamed: 0,uniprot_id,name,description,species,sequence
NYCPR,A0A811YLF5,A0A811YLF5_NYCPR,(raccoon dog) hypothetical protein,Nyctereutes procyonoides (Raccoon dog),MTENSTSTPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSS...
9SAUR,A0A6I9XPX0,A0A6I9XPX0_9SAUR,8-amino-7-oxononanoate synthase-like,Thamnophis sirtalis,MWCSQFFRSPFVLGQCSPSRAQSALAQLKHLLEGELEGIRGAGTWK...
MYOBR,S7QB19,S7QB19_MYOBR,"2-amino-3-ketobutyrate coenzyme A ligase, mito...",Myotis brandtii (Brandt's bat),MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSS...
CAMFR,S9YWH7,S9YWH7_CAMFR,"2-amino-3-ketobutyrate coenzyme A ligase, mito...",Camelus ferus (Wild bactrian camel),MTENSTSTPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSS...


In [86]:
# Saving to metadata
metadata['templates'] = filtered.to_dict(orient='index')
with open(metadata_path, 'w') as file:
    json.dump(metadata, file, indent=4)

## Generating sequences for simulations

This section uses general code blocks to assemble IDR sequences and save them as FASTA files and metadata .json files.

Generating data requires a starting metadata .json file containing UniProt IDs (See `~/exp/README.md`)

#### How to prepare data
1. **Specify experiment**
2. **Specify variants**
3. **Generate data**
4. **Save sequences**
5. **Save metadata**

### 1. Experiment name

In [13]:
experiment = 'initial'

In [87]:
experiment = 'ortho'

### 2. Variant types

In [3]:
# All
variants = list(process_data.variant_types.keys())

In [88]:
# Only wildtype
variants = 'wt'

### 3. Generating data

In [89]:
# Getting experiment json
metadata_path = f'exp/{experiment}/{experiment}.json'
with open(metadata_path, 'r') as file:
    metadata = json.load(file)

# Initiating DataFrame
proteins = []
not_IDP = []
for id, fields in tqdm(metadata['templates'].items()):
    try:
        seq, loc, reg = process_data.get_protein_idr(uniprot_id=fields['uniprot_id'], i_idr=0, length_order=True)
    except ValueError:
        not_IDP.append((id, fields['uniprot_id']))
        continue
    proteins.append({'template': id, 'sequence': seq, 'location': loc, 'region': reg})
if not_IDP:
    print(f"The following {len(not_IDP)} UniProt IDs did not return IDPs:")
    print(*[f" - {uid}\t({id})" for id, uid in not_IDP], sep='\n')
data = pd.DataFrame(proteins)

# Expanding dataframe to include all variants
data['variant'] = [variants]*len(data)
if type(variants) == list:
    data = data.explode('variant')
    data.index = data['template'] + '_' + data['variant'].str.upper()
else:
    data.index = data['template']

# Generating variant sequence
data['sequence'] = data.apply(lambda row: process_data.generate_variant(row['sequence'], row['variant']), axis=1)

# Setting variant description
data['variant'] = data.apply(lambda row: process_data.variant_types[row['variant']]['name'], axis=1)

data

100%|██████████| 195/195 [00:56<00:00,  3.45it/s]


Unnamed: 0_level_0,template,sequence,location,region,variant
template,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HUMAN,HUMAN,TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,83:194,CTD,Wild type
THEGE,THEGE,TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,83:194,CTD,Wild type
COLAP,COLAP,TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,83:194,CTD,Wild type
MACNE,MACNE,TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,83:194,CTD,Wild type
GORGO,GORGO,TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,83:194,CTD,Wild type
...,...,...,...,...,...
9COLU,9COLU,AKKSLARKKKAARRSTSPRRAARPKKAKSPSKKRRSTTKKARKKSR...,104:193,CTD,Wild type
TODME,TODME,VLKQTKGVGASGSFRLAKAIKAKKSPAKKRKKAARRSTSPRKAARS...,81:191,CTD,Wild type
HERCA,HERCA,QTKGVGASGSFRLAKAEKVKKSPARKRKKAARKSTSPRKAARPKKA...,84:192,CTD,Wild type
EOLRO,EOLRO,KQTKGVGASGSFRLAKGNKAKRSPSRKRRKKVARKSTSPRKAARHR...,83:193,CTD,Wild type


### 4. Saving sequences

In [90]:
# Saving sequences
datadir = f'exp/{experiment}/data'
os.makedirs(datadir, exist_ok=True)

# Iterating over rows
for index, cols in data.iterrows():
    id = index.upper()
    desc = cols['variant']
    seq = cols['sequence']

    # Saving as FASTA format (Though one-line sequence)
    filepath = datadir + '/' + id + '.fasta'
    with open(filepath, 'w') as file:
        file.write('>' + id + ' ' + desc + '\n')
        file.write(seq)

### 5. Saving metadata

In [91]:
# Saving metadata
metadata['data'] = data.to_dict(orient='index')
with open(metadata_path, 'w') as file:
    json.dump(metadata, file, indent=4)