In [1]:
import pandas as pd
import numpy as np

import os
import pathlib
import importlib

from sklearn.model_selection import train_test_split

In [25]:
def create_dir_if_not_exists(path):
    """Make sure ``path`` exists as a directory.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat (for example
    when the configuration cell is re-run).

    Args:
        path: Directory to create, as a string or path-like object.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # parents=True: a bare os.mkdir fails when an intermediate directory is
    # missing. exist_ok=True: no separate exists() check is needed, which
    # removes the window between the check and the creation.
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

def create_train_val_test_dir(path):
    """Create the ``train``, ``val`` and ``test`` sub-directories of ``path``.

    Args:
        path: Parent directory that receives the three sub-directories.
    """
    split_names = ('train', 'val', 'test')
    for split_name in split_names:
        split_path = '{}/{}'.format(path, split_name)
        create_dir_if_not_exists(split_path)

In [32]:
# Directory containing the OAS files
OAS_FILE_DIR = r'../../datasets/OAS_sample'

# Root directory for every file produced by this notebook.
# The trailing '/' combined with the '{}/...' templates below produces paths
# with a double slash (e.g. '../datasets//sequences', as the "Saved:" logs show).
# Each directory is created right after its path is defined, parents first.
DATASET_DIRECTORY = r'../datasets/'
create_dir_if_not_exists(DATASET_DIRECTORY)

# Directory to store the "raw" sequences
SEQUENCES_DIRECTORY = '{}/sequences'.format(DATASET_DIRECTORY)
create_dir_if_not_exists(SEQUENCES_DIRECTORY)

# Output of read_raw (all paired sequences) and of get_representative.
PAIRED_SEQUENCES_FILE = '{}/sequences.csv'.format(SEQUENCES_DIRECTORY)
ONLY_REPRESENTATIVE = '{}/representative.csv'.format(SEQUENCES_DIRECTORY)

# Outputs of the train/val/test split.
SEQUENCES_TRAIN = '{}/train.csv'.format(SEQUENCES_DIRECTORY)
SEQUENCES_VAL = '{}/val.csv'.format(SEQUENCES_DIRECTORY)
SEQUENCES_TEST = '{}/test.csv'.format(SEQUENCES_DIRECTORY)

# Split name -> split file, iterated by the get_germlines cells.
SEQUENCES_FILES = {
    'train': SEQUENCES_TRAIN,
    'val': SEQUENCES_VAL,
    'test': SEQUENCES_TEST
}

# Directory to store the fasta files
FASTA_DIRECTORY = '{}/fasta'.format(DATASET_DIRECTORY)
create_dir_if_not_exists(FASTA_DIRECTORY)

# Directory to store the clustering files
CLUSTERING_DIRECTORY = '{}/clustering'.format(DATASET_DIRECTORY)
create_dir_if_not_exists(CLUSTERING_DIRECTORY)

# Parent directory of the three classifier datasets below; each dataset
# directory gets its own train/val/test sub-directories.
CLASSIFICATOR_DIR = '{}/classificator'.format(DATASET_DIRECTORY)
create_dir_if_not_exists(CLASSIFICATOR_DIR)

RANDOM_DATASET_DIR = '{}/random'.format(CLASSIFICATOR_DIR)
create_dir_if_not_exists(RANDOM_DATASET_DIR)
create_train_val_test_dir(RANDOM_DATASET_DIR)

GERMLINE_ALL_DIR = '{}/germline_all'.format(CLASSIFICATOR_DIR)
create_dir_if_not_exists(GERMLINE_ALL_DIR)
create_train_val_test_dir(GERMLINE_ALL_DIR)

GERMLINE_V_DIR = '{}/germline_v'.format(CLASSIFICATOR_DIR)
create_dir_if_not_exists(GERMLINE_V_DIR)
create_train_val_test_dir(GERMLINE_V_DIR)

# Directory to store the data for additional tests
TEST_DIRECTORY = '{}/test'.format(DATASET_DIRECTORY)
create_dir_if_not_exists(TEST_DIRECTORY)

### Read raw data

In [3]:
import read_raw

In [4]:
# Options forwarded as keyword arguments to read_raw.read_raw.
WHICH_GERMLINE = 'all'  # passed as which_germline
STORE_SPECIE = False  # passed as store_specie
SUBSAMPLE = None  # passed as subsample
ONLY_HUMAN = True  # passed as only_human

In [5]:
# Run switch for the read_raw step: the next cell does nothing when this is False.
COMPUTE_NEW = True

In [6]:
# Read the OAS files and write the paired-sequence table.
if COMPUTE_NEW:
    # Reload so that edits made to read_raw.py are picked up without a restart.
    importlib.reload(read_raw)
    read_raw_options = dict(
        subsample=SUBSAMPLE,
        only_human=ONLY_HUMAN,
        which_germline=WHICH_GERMLINE,
        store_specie=STORE_SPECIE,
    )
    read_raw.read_raw(OAS_FILE_DIR, PAIRED_SEQUENCES_FILE, **read_raw_options)

Reading data...
Found 16 files.
human:                 14
rat_SD:                 2


100%|██████████| 14/14 [00:18<00:00,  1.32s/it]


Filtering the data
Initial number of rows: 87116
Removed 50301 rows (-57.740%), new number of rows: 36815.
Assining ids...
Number of unique heavy: 36555
Number of unique light: 28518
Number of unique pairs:  36813
Cleaning the germlines...
Saved: ../datasets//sequences/sequences.csv


### Cluster the sequences

In [7]:
import generate_fasta

In [8]:
# In the cells shown here WHICH only determines the fasta file name.
WHICH = 'both'

# 'both' maps to the plain file name; any other value becomes a suffix.
FASTA_SEQUENCES = (
    'sequences.fasta' if WHICH == 'both'
    else 'sequences_{}.fasta'.format(WHICH)
)

In [9]:
# Run switch for the fasta export: the next cell does nothing when this is False.
COMPUTE_NEW = True

In [10]:
# Export the paired sequences to a fasta file for the clustering step.
if COMPUTE_NEW:
    # Reload so that edits made to generate_fasta.py are picked up.
    importlib.reload(generate_fasta)
    fasta_path = '{}/{}'.format(FASTA_DIRECTORY, FASTA_SEQUENCES)
    generate_fasta.generate_fasta(PAIRED_SEQUENCES_FILE, fasta_path)

100%|██████████| 36814/36814 [00:04<00:00, 8893.07it/s]

Saved: ../datasets//fasta/sequences.fasta





In [11]:
# Fourth argument of cluster.sh.
MIN_SEQ_ID = 0.8

# First line: run cluster.sh on the fasta file (paths relative to the dataset root).
cluster_line = 'source cluster.sh {} {} {} {}'.format(
    DATASET_DIRECTORY,
    'fasta/{}'.format(FASTA_SEQUENCES),
    'clustering/sequences',
    MIN_SEQ_ID,
)
# Second line: remove the temporary fasta_files directory afterwards.
cleanup_line = 'rm -rf {}/fasta_files'.format(DATASET_DIRECTORY)
commands = '\n'.join([cluster_line, cleanup_line])

# The script is executed by the next cell.
with open('clustering_commands.sh', 'w') as script_file:
    script_file.write(commands)

In [12]:
# Run the generated clustering script through bash.
os.system('chmod +x clustering_commands.sh')
exit_code = os.system('bash ./clustering_commands.sh')
# os.system only returns the exit status. Fail loudly on a non-zero status so
# later cells do not continue after a failed clustering run.
if exit_code != 0:
    raise RuntimeError(
        'clustering_commands.sh failed with exit status {}'.format(exit_code)
    )
# Keep the status as the cell output, as before.
exit_code

createdb ../datasets//fasta/sequences.fasta ../datasets//DB/DB 

MMseqs Version:       	62975ca936b912083c2218e4e30ad962901cbb3b
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[===
Time for merging to DB_h: 0h 0m 0s 15ms
Time for merging to DB: 0h 0m 0s 23ms
Database type: Aminoacid
Time for processing: 0h 0m 0s 228ms
Create directory ../datasets//cluster/tmp
linclust ../datasets//DB/DB ../datasets//cluster/cluster ../datasets//cluster/tmp --min-seq-id 0.8 

MMseqs Version:                     	62975ca936b912083c2218e4e30ad962901cbb3b
Cluster mode                        	0
Max connected component depth       	1000
Similarity type                     	2
Threads                             	4
Compressed                          	0
Verbosity                           	3
Weight file name                    	
Cluster Weight threshold       

0

### Keep only the representative sequences

In [13]:
import get_representative

In [14]:
# Run switch for the representative extraction: the next cell does nothing when this is False.
COMPUTE_NEW = True

In [15]:
# Extract the representative sequences using the clustering table.
if COMPUTE_NEW:
    # Reload so that edits made to get_representative.py are picked up.
    importlib.reload(get_representative)
    clusters_tsv = '{}/sequences.tsv'.format(CLUSTERING_DIRECTORY)
    get_representative.get_representative(
        clusters_tsv, PAIRED_SEQUENCES_FILE, ONLY_REPRESENTATIVE
    )

Extracting only the representative sequences...
Saved: ../datasets//sequences/representative.csv


### Split into train, val and test

In [16]:
import split

In [17]:
# Run switch for the split step: the next cell does nothing when this is False.
COMPUTE_NEW = True

In [18]:
# Split the representative sequences into the train, val and test CSV files
# (arguments: input file, then the three output files in that order).
if COMPUTE_NEW:
    # Reload so that edits made to split.py are picked up.
    importlib.reload(split)
    split.split(ONLY_REPRESENTATIVE, SEQUENCES_TRAIN, SEQUENCES_VAL, SEQUENCES_TEST)

Start splitting
Save training split: ../datasets//sequences/train.csv
Save validation split: ../datasets//sequences/val.csv
Save test split: ../datasets//sequences/test.csv


### Get germline files

In [28]:
import get_germlines

In [33]:
# Run switch for the two get_germlines cells that follow (which='v' and which='all').
COMPUTE_NEW = True

In [35]:
# Germline files restricted to which='v', one pair of outputs per split.
if COMPUTE_NEW:
    # Reload so that edits made to get_germlines.py are picked up.
    importlib.reload(get_germlines)
    for split_name, split_csv in SEQUENCES_FILES.items():
        sequences_out = '{}/{}/{}_seq_only_v'.format(GERMLINE_V_DIR, split_name, split_name)
        germlines_out = '{}/{}/{}_germ_only_v'.format(GERMLINE_V_DIR, split_name, split_name)
        get_germlines.get_germlines(split_csv, sequences_out, germlines_out, which='v')

Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 16
Number of possibile heavy and light combinations: 112
Saved: ../datasets//classificator/germline_v/train/train_seq_only_v, ../datasets//classificator/germline_v/train/train_germ_only_v
Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 16
Number of possibile heavy and light combinations: 112
Saved: ../datasets//classificator/germline_v/val/val_seq_only_v, ../datasets//classificator/germline_v/val/val_germ_only_v
Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 17
Number of possibile heavy and light combinations: 119
Saved: ../datasets//classificator/germline_v/test/test_seq_only_v, ../datasets//classificator/germline_v/test/test_germ_only_v


In [36]:
# Germline files with which='all', one pair of outputs per split.
if COMPUTE_NEW:
    # Reload so that edits made to get_germlines.py are picked up.
    importlib.reload(get_germlines)
    for split_name, split_csv in SEQUENCES_FILES.items():
        sequences_out = '{}/{}/{}_seq_all'.format(GERMLINE_ALL_DIR, split_name, split_name)
        germlines_out = '{}/{}/{}_germ_all'.format(GERMLINE_ALL_DIR, split_name, split_name)
        get_germlines.get_germlines(split_csv, sequences_out, germlines_out, which='all')

Get germlines id...
Number of unique heavy combinations: 220
Number of unique light combinations: 59
Number of possibile heavy and light combinations: 12980
Saved: ../datasets//classificator/germline_all/train/train_seq_all, ../datasets//classificator/germline_all/train/train_germ_all
Get germlines id...
Number of unique heavy combinations: 205
Number of unique light combinations: 57
Number of possibile heavy and light combinations: 11685
Saved: ../datasets//classificator/germline_all/val/val_seq_all, ../datasets//classificator/germline_all/val/val_germ_all
Get germlines id...
Number of unique heavy combinations: 235
Number of unique light combinations: 62
Number of possibile heavy and light combinations: 14570
Saved: ../datasets//classificator/germline_all/test/test_seq_all, ../datasets//classificator/germline_all/test/test_germ_all


### Germline pairing

In [37]:
import germline_pairing

In [38]:
# Last two positional arguments of germline_pairing.germline_pairing
# (passed as NUMBER, ALPHA). ALPHA is also embedded in the output file names.
ALPHA = 1000
NUMBER = None

In [32]:
# Run switch for the three germline_pairing cells that follow.
COMPUTE_NEW = True

In [36]:
# Germline pairing for the training split.
# NOTE(review): TRAIN_DIR is not assigned in any cell visible in this notebook,
# and the captured "Saved:" path points to '../datasets/new/...', which differs
# from the current DATASET_DIRECTORY. This cell looks stale; confirm the
# intended directory, otherwise a fresh run raises NameError.
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/train_seq_only_v.csv'.format(TRAIN_DIR), 
                                      '{}/train-germline_pairing-alpha_{}_only_v.csv'.format(TRAIN_DIR, ALPHA),
                                      '{}/train_germ_only_v.csv'.format(TRAIN_DIR),
                                      NUMBER, ALPHA)

13 out of 126 of only zeros


100%|█████████████████████████████████████████| 13/13 [00:00<00:00, 1124.04it/s]
100%|████████████████████████████████████████| 13/13 [00:00<00:00, 55809.57it/s]
100%|██████████████████████████████████████████| 13/13 [00:00<00:00, 141.41it/s]


Retrieve sequences


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 30.66it/s]


Saved: ../datasets/new/classificator/train/train-germline_pairing-alpha_1000_only_v.csv


In [37]:
# Germline pairing for the validation split.
# NOTE(review): VAL_DIR is not assigned in any cell visible in this notebook,
# and the captured "Saved:" path points to '../datasets/new/...', which differs
# from the current DATASET_DIRECTORY. This cell looks stale; confirm the
# intended directory, otherwise a fresh run raises NameError.
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/val_seq_only_v.csv'.format(VAL_DIR), 
                                      '{}/val-germline_pairing-alpha_{}_only_v.csv'.format(VAL_DIR, ALPHA),
                                      '{}/val_germ_only_v.csv'.format(VAL_DIR),
                                      NUMBER, ALPHA)

7 out of 119 of only zeros


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 1049.44it/s]
100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 34419.85it/s]
100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 198.24it/s]

Retrieve sequences



100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 46.48it/s]


Saved: ../datasets/new/classificator/val/val-germline_pairing-alpha_1000_only_v.csv


In [40]:
# Germline pairing for the test split.
# NOTE(review): TEST_DIR is not assigned in any cell visible in this notebook
# (the configuration cell defines TEST_DIRECTORY, a different name), and the
# captured "Saved:" path points to '../datasets/new/...'. This cell looks
# stale; confirm the intended directory, otherwise a fresh run raises NameError.
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/test_seq_only_v.csv'.format(TEST_DIR), 
                                      '{}/test-germline_pairing-alpha_{}_only_v.csv'.format(TEST_DIR, ALPHA),
                                      '{}/test_germ_only_v.csv'.format(TEST_DIR),
                                      NUMBER, ALPHA)

7 out of 119 of only zeros


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 1008.63it/s]
100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 37211.82it/s]
100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 145.85it/s]


Retrieve sequences


100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 33.72it/s]


Saved: ../datasets/new/classificator/test/test-germline_pairing-alpha_1000_only_v.csv


### Random pairing

In [3]:
import random_pairing

In [4]:
# Third positional argument of random_pairing.random_pairing.
NUMBER = None

In [5]:
# Run switch for the three random_pairing cells that follow.
COMPUTE_NEW = True

In [10]:
# Random pairing for the training split.
# NOTE(review): the paths below use 'classificator/train' and
# 'classificator/train_random', while the configuration cell creates
# 'classificator/random/{train,val,test}'. The captured output also refers to
# '../datasets/new/...'. Presumably written for an older layout; confirm.
if COMPUTE_NEW:
    importlib.reload(random_pairing)
    random_pairing.random_pairing('{}/classificator/train/train_seq.csv'.format(DATASET_DIRECTORY),
                                 '{}/classificator/train_random/train_random.csv'.format(DATASET_DIRECTORY), 
                                  NUMBER)

Sampling 716325 random pairs
Saved: ../datasets/new/classificator/train_random/train_random.csv


In [11]:
# Random pairing for the validation split.
# NOTE(review): the paths below use 'classificator/val' and
# 'classificator/val_random', while the configuration cell creates
# 'classificator/random/{train,val,test}'. The captured output also refers to
# '../datasets/new/...'. Presumably written for an older layout; confirm.
if COMPUTE_NEW:
    importlib.reload(random_pairing)
    random_pairing.random_pairing('{}/classificator/val/val_seq.csv'.format(DATASET_DIRECTORY),
                                 '{}/classificator/val_random/val_random.csv'.format(DATASET_DIRECTORY), 
                                  NUMBER)

Sampling 271233 random pairs
Saved: ../datasets/new/classificator/val_random/val_random.csv


In [12]:
# Random pairing for the test split.
# NOTE(review): the paths below use 'classificator/test' and
# 'classificator/test_random', while the configuration cell creates
# 'classificator/random/{train,val,test}'. The captured output also refers to
# '../datasets/new/...'. Presumably written for an older layout; confirm.
if COMPUTE_NEW:
    importlib.reload(random_pairing)
    random_pairing.random_pairing('{}/classificator/test/test_seq.csv'.format(DATASET_DIRECTORY),
                                 '{}/classificator/test_random/test_random.csv'.format(DATASET_DIRECTORY), 
                                  NUMBER)

Sampling 369597 random pairs
Saved: ../datasets/new/classificator/test_random/test_random.csv


### Merge positive and negative

In [None]:
import merge

In [None]:
# Run switch for the three merge cells that follow.
COMPUTE_NEW = True

In [None]:
# Random dataset
# For each split, merge.merge receives the '<split>_seq.csv' file, the
# '<split>_random.csv' file and the output path, in that order.
# NOTE(review): these paths ('classificator/train', 'classificator/train_random', ...)
# do not match the directories created by the configuration cell
# ('classificator/random/{train,val,test}'); confirm before running.
if COMPUTE_NEW:
    importlib.reload(merge)
    merge.merge('{}/classificator/train/train_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/val/val_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/test/test_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test.csv'.format(DATASET_DIRECTORY))

In [None]:
# Germline only V dataset
# NOTE(review): despite the heading, the calls below are identical to the
# "Random dataset" cell. They read the '*_random.csv' files and write to the
# '*_random' directories, so running this cell only rewrites the random
# dataset. Presumably copy-pasted and never adapted; confirm the intended
# germline (only V) inputs and outputs.
if COMPUTE_NEW:
    importlib.reload(merge)
    merge.merge('{}/classificator/train/train_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/val/val_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/test/test_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test.csv'.format(DATASET_DIRECTORY))

In [None]:
# Germline dataset
# NOTE(review): despite the heading, the calls below are identical to the
# "Random dataset" cell. They read the '*_random.csv' files and write to the
# '*_random' directories, so running this cell only rewrites the random
# dataset. Presumably copy-pasted and never adapted; confirm the intended
# germline inputs and outputs.
if COMPUTE_NEW:
    importlib.reload(merge)
    merge.merge('{}/classificator/train/train_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/train_random/train.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/val/val_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/val_random/val.csv'.format(DATASET_DIRECTORY))
    merge.merge('{}/classificator/test/test_seq.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test_random.csv'.format(DATASET_DIRECTORY),
                '{}/classificator/test_random/test.csv'.format(DATASET_DIRECTORY))

### Test dataset

In [3]:
# Despite its name, test_df holds the validation split (SEQUENCES_VAL); the
# name is kept because a later cell reads it.
test_df = pd.read_csv(SEQUENCES_VAL, index_col=0)

# Unique (pair, light chain) rows in shuffled order. The column order matters:
# create_pairs reads columns 1 and 2 by position.
light_sequences = (
    test_df[['pair_id', 'light_id', 'light']]
    .drop_duplicates()
    .reset_index(drop=True)
    # frac=1 shuffles every row; the fixed seed makes the shuffle, and the
    # pairs derived from it, reproducible across kernel restarts.
    .sample(frac=1, random_state=0)
)

In [4]:
import Levenshtein
from tqdm import tqdm
from itertools import combinations
from Bio import Align

from matplotlib import pyplot as plt

In [5]:
# Estimate the pairwise-similarity distribution on at most 10000 unique light chains.
unique_lights = light_sequences['light'].drop_duplicates()
# Capping n avoids the ValueError that sampling without replacement raises
# when more rows are requested than exist. The seed keeps the estimate, and
# the threshold derived from it, reproducible.
ls = unique_lights.sample(min(10000, len(unique_lights)), random_state=0).to_numpy()
# Levenshtein ratio of every unordered pair in the sample.
sim = [Levenshtein.ratio(x, y) for x, y in combinations(ls, 2)]

In [6]:
# Mean, standard deviation, minimum and maximum of the sampled pairwise similarities.
print(np.mean(sim), np.std(sim), np.min(sim), np.max(sim))

0.6231716007187188 0.11292343526922217 0.3677130044843049 0.9956331877729258


In [7]:
def create_pairs(light_sequences, threshold, sim_func):
    """Pair every light chain with a dissimilar light chain from the same table.

    Rows are visited in order. For each row the candidates are scanned
    round-robin, resuming right after the candidate chosen for the previous
    row. The first candidate whose similarity to the row's sequence is
    strictly below ``threshold`` is selected.

    Args:
        light_sequences: DataFrame with ``pair_id``, ``light_id`` and ``light``
            columns. Candidates are read by position: column 1 must hold the
            id and column 2 the sequence.
        threshold: Exclusive upper bound on the similarity of an accepted pair.
        sim_func: Callable ``(seq_a, seq_b)`` returning a similarity score.

    Returns:
        A list of ``(pair_id, light_id, partner_light_id)`` tuples, one per row.

    Raises:
        ValueError: If a row has no candidate below ``threshold``. The
            previous implementation looped forever in that case.
    """
    n_candidates = len(light_sequences)
    if n_candidates == 0:
        return []

    # Pull the candidate columns out once; scalar .iloc access for every
    # comparison is much slower than indexing a NumPy array.
    candidate_ids = light_sequences.iloc[:, 1].to_numpy()
    candidate_seqs = light_sequences.iloc[:, 2].to_numpy()

    pairs = []
    index = 0  # shared cursor: the scan continues where the last row stopped
    for _, r in tqdm(light_sequences.iterrows(), total=n_candidates):
        # One full cycle is enough: after n_candidates comparisons every
        # candidate has been tried for this row.
        for _ in range(n_candidates):
            candidate = index
            index = (index + 1) % n_candidates
            if sim_func(r['light'], candidate_seqs[candidate]) < threshold:
                pairs.append((r['pair_id'], r['light_id'], candidate_ids[candidate]))
                break
        else:
            raise ValueError(
                'no sequence with similarity below {} for light_id {}'.format(
                    threshold, r['light_id']
                )
            )
    return pairs

def create_pairs_random(light_sequences):
    """Print each row next to a row drawn from a random shuffle of the table.

    Unfinished helper: it only prints the row pairs and returns ``None``.

    Args:
        light_sequences: DataFrame whose rows are paired with a shuffled copy.
    """
    shuffled = light_sequences.sample(len(light_sequences))
    original_rows = (row for _, row in light_sequences.iterrows())
    shuffled_rows = (row for _, row in shuffled.iterrows())
    for original_row, shuffled_row in zip(original_rows, shuffled_rows):
        print(original_row, shuffled_row)
    
        
        
# Accept a partner only when its Levenshtein ratio is more than one standard
# deviation below the mean of the sampled pairwise similarities.
similarity_cutoff = np.mean(sim) - np.std(sim)
pairs_list = create_pairs(light_sequences, similarity_cutoff, Levenshtein.ratio)

#create_pairs_random(light_sequences)

100%|██████████████████████████████████| 271230/271230 [06:35<00:00, 685.57it/s]


In [8]:
# One column per position of the (pair_id, light_id, partner_light_id) tuples.
pairs = pd.DataFrame({
    column: [triple[position] for triple in pairs_list]
    for position, column in enumerate(('pair_id', 'positive_light', 'negative_light'))
})

In [9]:
# Attach the sequence of the positive light chain to every pair.
# how='right' keeps one output row per row of `pairs`.
df_pos = (
    light_sequences[['light_id', 'light']]
    .drop_duplicates()
    .merge(pairs, left_on='light_id', right_on='positive_light', how='right')
    .rename(columns={'light_id': 'light_id_pos', 'light': 'light_pos'})
    [['pair_id', 'light_id_pos', 'light_pos']]
)

In [10]:
# Attach the sequence of the negative light chain to every pair.
# how='right' keeps one output row per row of `pairs`.
df_neg = (
    light_sequences[['light_id', 'light']]
    .drop_duplicates()
    .merge(pairs, left_on='light_id', right_on='negative_light', how='right')
    .rename(columns={'light_id': 'light_id_neg', 'light': 'light_neg'})
    [['pair_id', 'light_id_neg', 'light_neg']]
)

In [11]:
# Join positive and negative light chains on their shared column(s), then
# attach the heavy chain of each pair the same way.
df = test_df[['pair_id', 'heavy_id', 'heavy']].merge(df_pos.merge(df_neg))
# Written under the additional-tests directory.
df.to_csv('{}/val.csv'.format(TEST_DIRECTORY))