# Data preparation

### Setup

In [18]:
import pandas as pd
import json
import itertools
import random
import shutil
from datetime import datetime
from src import process_data

### Downloading and preprocessing data

In [19]:
# Loading seqs.json (orient='index': each top-level key becomes a row label)
json_seqs_path = 'data/seqs/seqs.json'
seqs = pd.read_json(json_seqs_path, orient='index')

# Downloading data
# The helper is called once per row with the row's accession number and the row
# label as filename; whatever it returns is stored in 'raw_data' (the recorded
# output shows paths under data/seqs/raw/).
seqs['raw_data'] = seqs.apply(
    lambda row: process_data.get_protein_gp(
        acc_num=row['accession_number'],
        filename=row.name,
    ),
    axis=1,
)

# Extracting longest IDRs
# result_type='expand' spreads the helper's return value over columns; the first
# two columns are kept as the IDR file and its location.
# NOTE(review): presumably i_idr=0 with length_order=True selects the longest IDR,
# as the heading above says -- confirm against process_data.extract_idr_fasta.
idr_results = seqs.apply(
    lambda row: process_data.extract_idr_fasta(
        gp_path=row['raw_data'],
        i_idr=0,
        length_order=True,
    ),
    axis=1,
    result_type='expand',
)
seqs['idr_data'] = idr_results.iloc[:, 0]
seqs['idr_location'] = idr_results.iloc[:, 1]

# Updating metadata in seqs.json (Note: df.to_json formats '/' in paths as '\/')
seqs['timestamp'] = str(datetime.now())
# Serialise BEFORE opening the file: open(..., 'w') truncates immediately, and this
# cell overwrites its own input, so a serialisation error must not wipe seqs.json.
seqs_json = json.dumps(seqs.to_dict('index'), indent=4)
with open(json_seqs_path, 'w') as file:
    file.write(seqs_json)
seqs

Unnamed: 0,name,accession_number,raw_data,idr_data,idr_location,timestamp
H1-0,Human H1-0,NP_005309.1,data/seqs/raw/H1-0.gp,data/seqs/idr/H1-0_CTD.fasta,[83:194],2023-02-15 21:06:42.317770
H1-1,Human H1-1,NP_005316.1,data/seqs/raw/H1-1.gp,data/seqs/idr/H1-1_CTD.fasta,[93:215],2023-02-15 21:06:42.317770
H1-2,Human H1-2,NP_005310.1,data/seqs/raw/H1-2.gp,data/seqs/idr/H1-2_CTD.fasta,[91:213],2023-02-15 21:06:42.317770
H1-3,Human H1-3,NP_005311.1,data/seqs/raw/H1-3.gp,data/seqs/idr/H1-3_CTD.fasta,[89:221],2023-02-15 21:06:42.317770
H1-4,Human H1-4,NP_005312.1,data/seqs/raw/H1-4.gp,data/seqs/idr/H1-4_CTD.fasta,[91:219],2023-02-15 21:06:42.317770
H2B,Human H2B,NP_003519.1,data/seqs/raw/H2B.gp,data/seqs/idr/H2B_NTD.fasta,[0:35],2023-02-15 21:06:42.317770
H3-4,Human H3-4,NP_003484.1,data/seqs/raw/H3-4.gp,data/seqs/idr/H3-4_NTD.fasta,[0:43],2023-02-15 21:06:42.317770


### Generating variants

In [21]:
# Loading seqs.json
json_seqs_path = 'data/seqs/seqs.json'
seqs = pd.read_json(json_seqs_path, orient='index')

# Resetting variants: remove previously generated variant files, if any
json_vars_path = 'data/seqs/vars.json'
try:
    shutil.rmtree('data/seqs/var/')
except FileNotFoundError:
    # Nothing to reset (e.g. first run)
    pass

# Preparing variant dataframe: one row per (template sequence, variant type) pair
variants = pd.DataFrame(
    list(itertools.product(seqs.index, process_data.variant_types)),
    columns=['template', 'variant'],
)
# Row labels follow '<template>_<VARIANT>', e.g. 'H1-0_WT'
variants.index = [
    f"{template}_{variant.upper()}"
    for template, variant in zip(variants['template'], variants['variant'])
]
# A single seed is drawn per run and shared by all variants.
# The bounds must be integers: random.randint(0, 1e10) passes a float, which is
# deprecated since Python 3.10 and raises TypeError on Python 3.12+.
variants['seed'] = random.randint(0, 10**10)

# Generating variants
# Each row's template IDR file is looked up in seqs by label; the helper's return
# value is stored in 'var_data' (the recorded output shows paths under data/seqs/var/).
variants["var_data"] = variants.apply(
    lambda row: process_data.generate_variant_fasta(
        fasta_path=seqs.loc[row['template'], 'idr_data'],
        variant=row['variant'],
        filename=row.name,
        seed=row['seed'],
    ),
    axis=1,
)

# Updating metadata in vars.json (Note: df.to_json formats '/' in paths as '\/')
variants['timestamp'] = str(datetime.now())
# Serialise before opening the file so that a serialisation error cannot leave a
# truncated vars.json behind.
variants_json = json.dumps(variants.to_dict('index'), indent=4)
with open(json_vars_path, 'w') as file:
    file.write(variants_json)
variants

Unnamed: 0,template,variant,seed,var_data,timestamp
H1-0_WT,H1-0,wt,7420687855,data/seqs/var/H1-0_WT.fasta,2023-02-15 21:09:07.043747
H1-0_RAND,H1-0,rand,7420687855,data/seqs/var/H1-0_RAND.fasta,2023-02-15 21:09:07.043747
H1-0_CLUST,H1-0,clust,7420687855,data/seqs/var/H1-0_CLUST.fasta,2023-02-15 21:09:07.043747
H1-1_WT,H1-1,wt,7420687855,data/seqs/var/H1-1_WT.fasta,2023-02-15 21:09:07.043747
H1-1_RAND,H1-1,rand,7420687855,data/seqs/var/H1-1_RAND.fasta,2023-02-15 21:09:07.043747
H1-1_CLUST,H1-1,clust,7420687855,data/seqs/var/H1-1_CLUST.fasta,2023-02-15 21:09:07.043747
H1-2_WT,H1-2,wt,7420687855,data/seqs/var/H1-2_WT.fasta,2023-02-15 21:09:07.043747
H1-2_RAND,H1-2,rand,7420687855,data/seqs/var/H1-2_RAND.fasta,2023-02-15 21:09:07.043747
H1-2_CLUST,H1-2,clust,7420687855,data/seqs/var/H1-2_CLUST.fasta,2023-02-15 21:09:07.043747
H1-3_WT,H1-3,wt,7420687855,data/seqs/var/H1-3_WT.fasta,2023-02-15 21:09:07.043747
