# Data preparation

### Setup

In [8]:
import pandas as pd
import json
import itertools
import random
import shutil
from datetime import datetime
from src import process_data

### Downloading and preprocessing data

In [None]:
# Read the sequence metadata; every top-level key in seqs.json becomes one row
json_seqs_path = 'data/seqs/seqs.json'
seqs = pd.read_json(json_seqs_path, orient='index')

# Download the raw data of each sequence; the row label doubles as the filename
seqs['raw_data'] = seqs.apply(
    lambda entry: process_data.get_protein_gp(
        acc_num=entry['accession_number'],
        filename=entry.name,
    ),
    axis=1,
)

# Extract the longest IDR of each sequence (first IDR when ordered by length)
applied = seqs.apply(
    lambda entry: process_data.extract_idr_fasta(
        gp_path=entry['raw_data'],
        i_idr=0,
        length_order=True,
    ),
    axis=1,
    result_type='expand',
)
# First expanded column holds the IDR data, second one its location
seqs['idr_data'], seqs['idr_location'] = applied.iloc[:, 0], applied.iloc[:, 1]

# Stamp the run time and write the metadata back to seqs.json.
# json.dump is used because df.to_json would format '/' in paths as '\/'
seqs['timestamp'] = str(datetime.now())
with open(json_seqs_path, 'w') as seqs_file:
    json.dump(seqs.to_dict('index'), seqs_file, indent=4)
seqs

### Generating variants

In [14]:
# Load the sequence metadata from seqs.json (one row per template sequence)
json_seqs_path = 'data/seqs/seqs.json'
seqs = pd.read_json(json_seqs_path, orient='index')

# Resetting variants: delete the variant directory from any earlier run.
# A missing directory simply means there is nothing to clean up.
json_vars_path = 'data/seqs/vars.json'
try:
    shutil.rmtree('data/seqs/var/')
except FileNotFoundError:
    pass

# Preparing variant dataframe: one row per (template, variant type) pair,
# labelled '<template>_<VARIANT>'
variants = pd.DataFrame(
    list(itertools.product(seqs.index, process_data.variant_types)),
    columns=['template', 'variant'],
)
variants.index = variants['template'] + '_' + variants['variant'].str.upper()

# A single seed is drawn per run and shared by every variant; it is stored in
# vars.json alongside the results. The upper bound is an int literal because
# random.randint rejects float arguments such as 1e10 on Python 3.12+.
variants['seed'] = random.randint(0, 10**10)

# Generating variants from each template's IDR data; the row label doubles as the filename
variants['var_data'] = variants.apply(
    lambda row: process_data.generate_variant_fasta(
        fasta_path=seqs.loc[row['template'], 'idr_data'],
        variant=row['variant'],
        filename=row.name,
        seed=row['seed'],
    ),
    axis=1,
)

# Updating metadata in vars.json (Note: df.to_json formats '/' in paths as '\/')
variants['timestamp'] = str(datetime.now())
with open(json_vars_path, 'w') as file:
    json.dump(variants.to_dict('index'), file, indent=4)
variants

Unnamed: 0,template,variant,seed,var_data,timestamp
H1-0_WT,H1-0,wt,834744962,data/seqs/var/H1-0_WT.fasta,2023-02-15 10:55:56.496122
H1-0_RAND,H1-0,rand,834744962,data/seqs/var/H1-0_RAND.fasta,2023-02-15 10:55:56.496122
H1-0_CLUST,H1-0,clust,834744962,data/seqs/var/H1-0_CLUST.fasta,2023-02-15 10:55:56.496122
H1-1_WT,H1-1,wt,834744962,data/seqs/var/H1-1_WT.fasta,2023-02-15 10:55:56.496122
H1-1_RAND,H1-1,rand,834744962,data/seqs/var/H1-1_RAND.fasta,2023-02-15 10:55:56.496122
H1-1_CLUST,H1-1,clust,834744962,data/seqs/var/H1-1_CLUST.fasta,2023-02-15 10:55:56.496122
H1-2_WT,H1-2,wt,834744962,data/seqs/var/H1-2_WT.fasta,2023-02-15 10:55:56.496122
H1-2_RAND,H1-2,rand,834744962,data/seqs/var/H1-2_RAND.fasta,2023-02-15 10:55:56.496122
H1-2_CLUST,H1-2,clust,834744962,data/seqs/var/H1-2_CLUST.fasta,2023-02-15 10:55:56.496122
H1-3_WT,H1-3,wt,834744962,data/seqs/var/H1-3_WT.fasta,2023-02-15 10:55:56.496122
