In [1]:
import random

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import numpy as np
import pandas as pd

from scripts.general import preprocess

### Load augmented abundaces 

In [2]:
aug_abund = pd.read_csv('../20211223_105853/data/abundance.csv')

### Add counter-examples to the protein sequences

Same number of counter-examples as number of experiments for a protein

In [3]:
counterexample_counts = aug_abund.groupby('swissprot_ac')['experiment'].count().reset_index()
counterexample_counts = dict(counterexample_counts.values)

In [4]:
random.seed(42)

seq = preprocess.fasta_to_seq_df(
    '../../data/seq/scerevisiae_aminoacid_uniprot_20200120_seqlen_100_to_1000.fasta'
)

seq = seq[seq['swissprot_ac'].isin(aug_abund['swissprot_ac'].values)]

In [5]:
def generate_shuffles(prot_id:str, seq: str) -> pd.DataFrame:
    n_rands = range(counterexample_counts[prot_id])
    shuffled_seq = [''.join(random.sample(seq, k=len(seq))) for _ in n_rands]
    
    return pd.DataFrame.from_records(
        zip([prot_id] + [f'{prot_id}_shuffled_{i}' for i in n_rands],
            [seq] + shuffled_seq),
        columns=['swissprot_ac', 'seq']
    )    

seq_and_counters = pd.concat(
    seq.apply(lambda row: generate_shuffles(row['swissprot_ac'], row['seq']),
              axis='columns').values,
    ignore_index=True
)

### Set zeroe abundance for counter-examples

In [6]:
aug_abund = (
    pd.merge(aug_abund, seq_and_counters['swissprot_ac'], how='outer')
    .fillna(1e-5)
)

In [7]:
aug_abund.tail()

Unnamed: 0,Systematic_Name,swissprot_ac,experiment,Median_molecules_per_cell
101409,1e-05,P43574_shuffled_5,1e-05,1e-05
101410,1e-05,P43574_shuffled_6,1e-05,1e-05
101411,1e-05,P43574_shuffled_7,1e-05,1e-05
101412,1e-05,P43574_shuffled_8,1e-05,1e-05
101413,1e-05,P43574_shuffled_9,1e-05,1e-05


### Write files

In [8]:
! cat data/config.yaml

data_root: '~/projects/DeepTranslation'
protein_sequence: 'results/20211223_182228/data/augmented_sequences_with_counterexamples.fasta'
protein_abundance: 'results/20211223_182228/data/augmented_abundance_with_counterexamples.csv'


In [9]:
def write_df_to_fasta(seq_df: pd.DataFrame, id_col: str, seq_col, fasta_fname: str):
    fasta_records = []
    for _, entry in seq_df.iterrows():
        rec = SeqRecord(id = entry[id_col], seq = Seq(entry[seq_col]), description = '')
        fasta_records.append(rec)

    with open(fasta_fname, 'w') as fout:
        SeqIO.write(fasta_records, fout, 'fasta')

seq_and_counters['swissprot_ac'] = seq_and_counters['swissprot_ac'].map(
    lambda prot_id: f'sp|{prot_id}|seq_and_counters'
)
write_df_to_fasta(seq_and_counters, id_col='swissprot_ac', seq_col='seq',
                  fasta_fname='data/augmented_sequences_with_counterexamples.fasta')

aug_abund.to_csv('data/augmented_abundance_with_counterexamples.csv', index=False)

## Test preprocessing

In [10]:
import os
from pathlib import Path
import yaml

from scripts.general import preprocess

def prepare_data(data_path: str) -> tuple:
    with (Path(data_path) / 'config.yaml').open('r') as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    data_root = os.path.expanduser(config['data_root'])
    fasta_fname = Path(data_root) / config['protein_sequence']
    abundances_fname = Path(data_root) / config['protein_abundance']
    seq_and_abundances = pd.merge(preprocess.fasta_to_seq_df(fasta_fname),
                                  pd.read_csv(abundances_fname),
                                  on='swissprot_ac')
#     seq_and_abundances = seq_and_abundances[['seq', 'Median_molecules_per_cell']]
    return seq_and_abundances

input_data = prepare_data('data')

In [11]:
input_data.sort_values('swissprot_ac')

Unnamed: 0,swissprot_ac,seq,Systematic_Name,experiment,Median_molecules_per_cell
12474,D6VTK4,MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...,YFL026W,CHO,9858.00000
12472,D6VTK4,MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...,YFL026W,BRE,6538.00000
12473,D6VTK4,MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...,YFL026W,MAZ,3069.00000
12469,D6VTK4,MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...,YFL026W,LEE2,6367.00000
12475,D6VTK4,MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...,YFL026W,NEW,13952.00000
...,...,...,...,...,...
55897,Q9URQ3_shuffled_3,ILLKDLGREKVYMLTTPVVWQSWIELMRWDDKQVLINKVVGVNIAV...,1e-05,1e-05,0.00001
55898,Q9URQ3_shuffled_4,KRPSVSELDNKVPDNWMSIPKVRSGWNRNECSLKSPNPCDEDSMKA...,1e-05,1e-05,0.00001
55899,Q9URQ3_shuffled_5,LFSCSFGVLIGIKADSDLYIDVEEDKVCDLVWQPRLPNLADNLESE...,1e-05,1e-05,0.00001
73748,Q9ZZW7,MAFRKSNVYLSLVNSYIIDSPQPSSINYWWNMGSLLGLCLVIQIVT...,Q0115,PENG,183.00000
