# Data preparation

### Setup

In [6]:
import pandas as pd
import json
import os
from datetime import datetime
from src import process_data

## Generating sequences for experiments

This section uses general code blocks to assemble IDR sequences and save them as FASTA files and metadata .json files.

Generating data requires a starting metadata .json file that contains the custom protein IDs and their accession numbers (see ``~/exp/README.md``).

#### How to prepare data
1. **Specify experiment**
2. **Specify variants**
3. **Generate data**
4. **Save sequences**
5. **Save metadata**

### 1. Experiment name

In [2]:
# Name of the experiment; later cells build their paths under exp/<experiment>/.
experiment = "initial"

### 2. Variant types

In [21]:
# Option: wild type only.
# NOTE: the next cell reassigns `variants`, so run only one of the two cells.
variants = "wt"

In [24]:
# Option: every variant type known to process_data.
# Materialise the keys as a list: a dict_keys view is a live view of the
# dictionary and is not one of the list-likes that DataFrame.explode documents,
# whereas a plain list is a stable snapshot that pandas handles explicitly.
variants = list(process_data.variant_types.keys())

### 3. Generating data

In [31]:
# Load the experiment metadata (custom protein IDs mapped to accession numbers).
metadata_path = f'exp/{experiment}/{experiment}.json'
with open(metadata_path, 'r') as file:
    metadata = json.load(file)

# Accept either a single variant key (e.g. 'wt') or an iterable of keys.
# A bare string has to be wrapped: otherwise len() below counts characters,
# and a single variant would wrongly be treated as "several variants".
variant_keys = [variants] if isinstance(variants, str) else list(variants)
if not variant_keys:
    raise ValueError('`variants` must contain at least one variant type')

# One protein record per accession number, in metadata order.
accessions = metadata['accession_numbers']
gp_files = [process_data.get_protein_gp(an) for an in accessions.values()]
data = pd.DataFrame(index=list(accessions.keys()))

# Basic annotations. The name is the description text before the first '[';
# rstrip() removes the separating space without eating a real character when
# the description contains no '[' at all.
data['template'] = data.index
data['name'] = [gp.description.split('[')[0].rstrip() for gp in gp_files]
data['species'] = [gp.annotations['source'] for gp in gp_files]

# Extract the IDR once per record and unpack the three fields that are used
# (sequence, region, location) instead of repeating the extraction per column.
idrs = [process_data.extract_idr(gp, length_order=True) for gp in gp_files]
data['sequence'] = [idr[0] for idr in idrs]
data['region'] = [idr[1] for idr in idrs]
data['location'] = [idr[2] for idr in idrs]

# Expand the frame to one row per (template, variant) pair.
data['variant'] = [variant_keys] * len(data)
data = data.explode('variant')
if len(variant_keys) > 1:
    # Several variants per template: make the index unique by suffixing the key.
    data.index = data['template'] + '_' + data['variant'].str.upper()

# Generate each variant sequence from its template IDR, row by row.
# NOTE(review): some variant types look stochastic (e.g. a shuffled variant);
# if so, seed the generator for reproducible output - confirm in process_data.
data['sequence'] = [
    process_data.generate_variant(sequence, key)
    for sequence, key in zip(data['sequence'], data['variant'])
]

# Replace the variant key by its human-readable description.
# After this line 'variant' no longer holds keys of process_data.variant_types.
data['variant'] = [process_data.variant_types[key]['name'] for key in data['variant']]

data

Unnamed: 0,template,name,species,sequence,region,location,variant
H1-0_WT,H1-0,histone H1.0,Homo sapiens (human),TKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKA...,CTD,[83:194],Wild type
H1-0_RAND,H1-0,histone H1.0,Homo sapiens (human),VADSKKTSAKAKKKKAKPAKRKAPTGKKPSSTAKKKKAPKKKAKLK...,CTD,[83:194],Randomly shuffled
H1-0_CLUST,H1-0,histone H1.0,Homo sapiens (human),KKKRKKKKKKKKKKKKKKKKKRKKKKKKKKKKKKKKKKKKKKKKKA...,CTD,[83:194],Terminally clustered charges
H2B_WT,H2B,histone H2B type 2-E,Homo sapiens (human),MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRK,NTD,[0:35],Wild type
H2B_RAND,H2B,histone H2B type 2-E,Homo sapiens (human),PPKKRPKSAEVDSPKKSAMAKAKAKGKGTQKKKRR,NTD,[0:35],Randomly shuffled
H2B_CLUST,H2B,histone H2B type 2-E,Homo sapiens (human),KRKRKKKKKKKKRKKAPAVTGAQAMPGASPSPSDE,NTD,[0:35],Terminally clustered charges


### 4. Saving sequences

In [8]:
# Directory that receives one sequence file per row of `data`.
datadir = f'exp/{experiment}/data'
os.makedirs(datadir, exist_ok=True)

# Write every sequence to <datadir>/<ID>.fasta, where ID is the upper-cased row index.
# The variant description is deliberately not looked up here: it was unused, and
# the 'variant' column already holds display names once the data has been
# generated, so using it as a key into process_data.variant_types would fail on
# a fresh top-to-bottom run (assuming that dict is keyed by the short codes).
for index, cols in data.iterrows():
    seq_id = index.upper()  # avoid shadowing the builtin `id`
    seq = cols['sequence']

    # FASTA format: a '>' header line followed by the sequence on a single line.
    filepath = f'{datadir}/{seq_id}.fasta'
    with open(filepath, 'w') as file:
        file.write('>' + seq_id + '\n')
        file.write(seq)

### 5. Saving metadata

In [32]:
# Attach the generated table to the experiment metadata, keyed by row index.
metadata['data'] = data.to_dict(orient='index')

# Serialise BEFORE opening the file. Opening with 'w' truncates the experiment's
# .json immediately, so a serialisation error raised afterwards (for example a
# value that is not JSON-serialisable) would leave the input file empty.
serialized = json.dumps(metadata, indent=4)
with open(metadata_path, 'w') as file:
    file.write(serialized)

## Finding orthologs