In [15]:
import pandas as pd
import numpy as np
import re
# Load the raw UniProt export. The file is tab-separated; later cells read its
# 'Entry', 'Sequence' and 'Active site' columns.
raw_table_path = 'uniprot_experimental.tsv'
df = pd.read_csv(raw_table_path, sep='\t')

In [48]:
def cleanup_act_site(raw_string: str) -> list:
    """Extract zero-based active-site positions from a raw annotation string.

    Every occurrence of ``ACT_SITE <number>`` in ``raw_string`` is collected and
    the (1-based) number is shifted down by one so it can be used directly as a
    Python index into the sequence.

    Args:
        raw_string: Raw annotation text containing zero or more
            ``ACT_SITE <number>`` entries. Non-string values (e.g. the NaN
            pandas uses for an empty cell) are treated as "no annotation".

    Returns:
        List of zero-based integer positions, in order of appearance. Empty
        when nothing matches or the input is not a string.
    """
    # Empty cells arrive as float NaN; re.findall would raise TypeError on them.
    if not isinstance(raw_string, str):
        return []
    # Capture the digits directly instead of lstrip('ACT_SITE '), which strips a
    # *character set* rather than a prefix. The digit run is unbounded so that
    # positions >= 10000 are not silently truncated to their first four digits.
    positions = re.findall(r'ACT_SITE ([0-9]+)', raw_string)
    return [int(pos) - 1 for pos in positions]  # -1 converts to 0-based indexing
    

In [49]:
# Sequences and their parsed active-site positions, kept in the same row order.
sequences = df['Sequence']
# One list of zero-based positions per row of the 'Active site' column.
site_indices = df['Active site'].apply(cleanup_act_site)

In [56]:
# Build one label string per sequence: 'A' at every active-site position,
# 'N' everywhere else. The label string has the same length as the sequence.
tag_sequences = []
for sequence, sites in zip(sequences, site_indices):
    labels = list('N' * len(sequence))
    for position in sites:
        # Direct index assignment: a position past the end of the sequence
        # raises IndexError rather than being dropped.
        labels[position] = 'A'
    tag_sequences.append(''.join(labels))
    

In [161]:
# Wrap the label strings in a named Series so the column header comes out as 'Tags'.
tags = pd.Series(tag_sequences, name='Tags')

# Write the sequence/label pairs side by side as a two-column TSV.
tagged_pairs = pd.concat([sequences, tags], axis=1)
tagged_pairs.to_csv('active_site_tagged_sequences_experimental.tsv', sep='\t', index=False)

# Keep the labels on the main table too; the split cell selects df[['Sequence', 'Tags']].
df['Tags'] = tags

## Homology partitioning and train-test-val split

In [None]:
# Cluster the sequences in uniprot_experimental.fasta with MMseqs2 (shell escape).
# Output is written under the 'clusterres/' prefix, using 'tmp' as scratch space;
# the resulting 'clusterres/_cluster.tsv' is what the next cell reads back.
# NOTE(review): --min-seq-id 0.2 is presumably a 20% sequence-identity threshold and
# -c 0.8 / --cov-mode 1 an 80% coverage requirement — confirm the exact semantics of
# these flags (and of --cluster-mode 2) against the installed MMseqs2 version.
!mmseqs easy-cluster uniprot_experimental.fasta clusterres/ tmp -a --min-seq-id 0.2 -s 6 --cov-mode 1 -c 0.8 --cluster-mode 2 --max-seqs 2000 --threads 12

In [224]:
# Bring the cluster assignments and the raw table into the same row order so that
# row i of `clusters` describes row i of `df`.
# NOTE(review): the alignment is purely positional — it assumes column 1 of the
# cluster file holds exactly the values of df['Entry'], one row per entry. Verify.
cluster_table = pd.read_csv('clusterres/_cluster.tsv', sep='\t', header=None)
clusters = cluster_table.sort_values(by=1).reset_index(drop=True)
df = df.sort_values(by='Entry').reset_index(drop=True)

In [229]:
#make splits
from sklearn.model_selection import GroupShuffleSplit

# Held-out test split. Rows are grouped by column 0 of `clusters`, so every row
# of a given cluster lands on the same side of the split. With groups,
# train_size is the fraction of groups assigned to train, not the fraction of rows.
# NOTE(review): relies on `clusters` and `df` being row-aligned (see the sort above).
gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)
splits = gss.split(df, groups=clusters[0])

tr_idx, test_idx = next(splits)  # n_splits=1: the first split is the only one
train_df = df.loc[tr_idx].reset_index(drop=True)
test_df = df.loc[test_idx].reset_index(drop=True)
test_df[['Sequence', 'Tags']].to_csv('test.tsv', sep='\t', index=False)
# Re-index the cluster ids exactly like train_df so positions keep matching below.
train_clusters = clusters.loc[tr_idx].reset_index(drop=True)


#crossvalidation splits
# Five group-wise 80/20 re-splits of the training portion.
# NOTE: GroupShuffleSplit draws every split independently, so the validation
# sets of different folds may overlap (this is not a K-fold partition).
gss = GroupShuffleSplit(n_splits=5, train_size=.8, random_state=42)
splits = gss.split(train_df, groups=train_clusters[0])

# The validation indices get their own name so that the held-out `test_idx`
# computed above is not overwritten by the last fold.
for i, (train_idx, valid_idx) in enumerate(splits):
    train_df.loc[train_idx, ['Sequence', 'Tags']].to_csv(f'train_{i}.tsv', sep='\t', index=False)
    train_df.loc[valid_idx, ['Sequence', 'Tags']].to_csv(f'valid_{i}.tsv', sep='\t', index=False)

## Test dataloading before moving to modeling

In [233]:
import sys
import torch
# Extend the module search path two directories up so the `train_scripts`
# package can be imported. The path is relative, so it is resolved against the
# kernel's working directory — NOTE(review): confirm the notebook is always run
# from its own folder.
sys.path.append('../..')
from train_scripts.training_utils import SequenceTaggingDataset

In [72]:
# Integer class id for each tag character used in the 'Tags' column:
# 'N' (no active site) -> 0, 'A' (active site) -> 1.
tag_to_id = {'N': 0, 'A': 1}
ds = SequenceTaggingDataset(
    data_file='active_site_tagged_sequences_experimental.tsv',
    label_dict=tag_to_id,
)

In [234]:
# Batch the dataset five samples at a time, using the dataset's own collate
# function to assemble each batch.
dl = torch.utils.data.DataLoader(
    ds,
    batch_size=5,
    collate_fn=ds.collate_fn,
)
# Explicit iterator so single batches can be pulled with next().
iterator = iter(dl)

In [236]:
# Pull one batch from the loader; the collate function yields three items.
first_batch = next(iterator)
data, targets, mask = first_batch

(147,)
(147,)
(290,)
(290,)
(780,)
(780,)
(384,)
(384,)
(430,)
(430,)


In [258]:
# Display the mask returned with the batch. In the output below it has five
# columns (the batch size) and its last rows are 1 only in the third column.
# NOTE(review): this suggests a (max_len, batch) layout with 1 marking real
# positions and 0 marking padding — confirm against collate_fn.
mask

tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        ...,
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0]])

In [257]:
# Sanity check that per-element weights can mask positions out of the BCE loss:
# the last two elements are maximally wrong (prob 0, target 1) but carry weight 0,
# and the first three are perfect predictions, so the result shown below is 0.
# NOTE: this rebinds `targets`, replacing the batch targets unpacked earlier.
probs = torch.tensor([0, 1.0, 1.0, 0, 0])
targets = torch.tensor([0, 1, 1, 1, 1]).float()
loss_weights = torch.tensor([1, 1, 1, 0, 0]).float()
torch.nn.functional.binary_cross_entropy(probs, targets, weight=loss_weights)

tensor(0.)