In [14]:
import pandas as pd
import numpy as np

import os
import pathlib
import importlib

from sklearn.model_selection import train_test_split

In [15]:
OAS_FILE_DIR = r'../../datasets/OAS'
DATASET_DIRECTORY = '../datasets/new'

# Directory to store the "raw" sequences
SEQUENCES_DIRECTORY = '{}/sequences'.format(DATASET_DIRECTORY)

if not pathlib.Path(SEQUENCES_DIRECTORY).exists():
    os.mkdir(SEQUENCES_DIRECTORY)

PAIRED_SEQUENCES_FILE = '{}/sequences.csv'.format(SEQUENCES_DIRECTORY)
ONLY_REPRESENTATIVE = '{}/representative.csv'.format(SEQUENCES_DIRECTORY)
SEQUENCES_TRAIN = '{}/train.csv'.format(SEQUENCES_DIRECTORY)
SEQUENCES_VAL = '{}/val.csv'.format(SEQUENCES_DIRECTORY)
SEQUENCES_TEST = '{}/test.csv'.format(SEQUENCES_DIRECTORY)

# Directory to store the fasta files
FASTA_DIRECTORY = '{}/fasta'.format(DATASET_DIRECTORY)
if not pathlib.Path(FASTA_DIRECTORY).exists():
    os.mkdir(FASTA_DIRECTORY)

# Directory to store the clustering files
CLUSTERING_DIRECTORY = '{}/clustering'.format(DATASET_DIRECTORY)
if not pathlib.Path(CLUSTERING_DIRECTORY).exists():
    os.makedirs(CLUSTERING_DIRECTORY)


CLASSIFICATOR_DIR = '{}/classificator'.format(DATASET_DIRECTORY)
if not pathlib.Path(CLASSIFICATOR_DIR).exists():
    os.makedirs(CLASSIFICATOR_DIR)

TRAIN_DIR = '{}/train'.format(CLASSIFICATOR_DIR)
if not pathlib.Path(TRAIN_DIR).exists():
    os.makedirs(TRAIN_DIR)

VAL_DIR = '{}/val'.format(CLASSIFICATOR_DIR)
if not pathlib.Path(VAL_DIR).exists():
    os.makedirs(VAL_DIR)

TEST_DIR = '{}/test'.format(CLASSIFICATOR_DIR)
if not pathlib.Path(TEST_DIR).exists():
    os.makedirs(TEST_DIR)


TEST_DIRECTORY = '{}/test'.format(DATASET_DIRECTORY)

if not pathlib.Path(TEST_DIRECTORY).exists():
    os.makedirs(TEST_DIRECTORY)

### Read raw data

In [6]:
import read_raw

In [7]:
WHICH_GERMLINE = 'v'
STORE_SPECIE = False
SUBSAMPLE = None
ONLY_HUMAN = True

In [9]:
COMPUTE_NEW = False

In [10]:
if COMPUTE_NEW:
    importlib.reload(read_raw)
    read_raw.read_raw(OAS_FILE_DIR, PAIRED_SEQUENCES_FILE, 
                      subsample=SUBSAMPLE, only_human=ONLY_HUMAN,
                      which_germline=WHICH_GERMLINE, store_specie=STORE_SPECIE)

Reading data...
Found 310 files.
human:                280
mouse_C57BL/6:          2
mouse_BALB/c:           8
rat_SD:                20


100%|█████████████████████████████████████████| 280/280 [04:47<00:00,  1.03s/it]


Filtering the data
Initial number of rows: 1954079
Removed 279826 rows (-14.320%), new number of rows: 1674253.
Assining ids...
Number of unique heavy: 1654917
Number of unique light: 724832
Number of unique pairs:  1674177
Cleaning the germlines...
Saved: ../datasets/new_only_v/sequences/sequences.csv


### Clusterize the sequences

In [11]:
import generate_fasta

In [12]:
WHICH = 'both'

if WHICH != 'both':
    FASTA_SEQUENCES = 'sequences_{}.fasta'.format(WHICH)
else:
    FASTA_SEQUENCES = 'sequences.fasta'

In [13]:
COMPUTE_NEW = True

In [14]:
if COMPUTE_NEW:
    importlib.reload(generate_fasta)
    generate_fasta.generate_fasta(PAIRED_SEQUENCES_FILE, 
                                  '{}/{}'.format(FASTA_DIRECTORY, FASTA_SEQUENCES))

100%|██████████████████████████████| 1674177/1674177 [01:13<00:00, 22781.21it/s]


Saved: ../datasets/new_only_v/fasta/sequences.fasta


In [15]:
MIN_SEQ_ID = 0.8

commands = 'source cluster.sh {} {} {} {}\n'.format(
    DATASET_DIRECTORY, 
    'fasta/{}'.format(FASTA_SEQUENCES), 
    'clustering/sequences', 
    MIN_SEQ_ID
)

commands += 'rm -rf {}/fasta_files'.format(DATASET_DIRECTORY)

with open('clustering_commands.sh', 'w') as f:
    f.write(commands)

### Get only representative

In [16]:
clusters = pd.read_csv('{}/sequences.tsv'.format(CLUSTERING_DIRECTORY), sep='\t', header=None).rename(
    {
        0: 'representative',
        1: 'sequences'
    },
    axis=1
)

In [17]:
cluster_sizes = {
    'representative': [],
    'size': []
}

for seq, data in clusters.groupby('representative'):
    cluster_sizes['representative'].append(seq)
    cluster_sizes['size'].append(len(data))

cluster_sizes = pd.DataFrame(cluster_sizes)

In [18]:
# Nota! Ci sono delle sequenze uguali ma con germline diverse

sequences = pd.read_csv(PAIRED_SEQUENCES_FILE, index_col=0)
representative = clusters['representative'].drop_duplicates()
representative = sequences.merge(representative, left_on='pair_id', right_on='representative', how='right')[sequences.columns]

In [19]:
representative.to_csv(ONLY_REPRESENTATIVE)

### Split in train, val and test

In [19]:
def compute_germline_data(df):
    from itertools import product
    
    germlines_dict = {'heavy_germline': [], 'light_germline': [], 'counter': []}

    for heavy_germline, data in df.groupby('heavy_germline'):
        for light_germline, data2 in data.groupby('light_germline'):
            germlines_dict['heavy_germline'].append(heavy_germline)
            germlines_dict['light_germline'].append(light_germline)
            germlines_dict['counter'].append(len(data2))

    germlines = pd.DataFrame(germlines_dict)

    all_germline_pairs = set(product(germlines['heavy_germline'].unique(), germlines['light_germline'].unique()))
    found_germline_pairs = set([(row['heavy_germline'], row['light_germline']) for _, row in germlines.iterrows()])
    not_found_germline_pairs = all_germline_pairs.difference(found_germline_pairs)

    zero_germlines = {'heavy_germline': [], 'light_germline': [], 'counter': []}

    for h, l in not_found_germline_pairs:
        zero_germlines['heavy_germline'].append(h)
        zero_germlines['light_germline'].append(l)
        zero_germlines['counter'].append(0)

    germlines = pd.concat([germlines, pd.DataFrame(zero_germlines)], axis=0).sort_values(by='counter', ascending=False)

    return germlines

def split(germlines, sequences):
    # Select some heavy germlines exclusive to dataset 2
    heavy_germlines_dict = {'germline': [], 'counter': []}
    for g, data in germlines.groupby('heavy_germline'):
        heavy_germlines_dict['germline'].append(g)
        heavy_germlines_dict['counter'].append(data.sum()['counter'])
    heavy_germlines = pd.DataFrame(heavy_germlines_dict)

    heavy_germlines = heavy_germlines.sort_values(by='counter', ascending=False)

    lower = heavy_germlines['counter'].quantile(0.20)
    higher = heavy_germlines['counter'].quantile(0.80)

    selected_heavy_germlines = heavy_germlines[(heavy_germlines['counter'] > lower) & (heavy_germlines['counter'] < higher)]
    selected_heavy_germlines = selected_heavy_germlines.sample(int(len(selected_heavy_germlines)*0.1))

    # Select some light germlines exlcusive to dataset 2
    light_germlines_dict = {'germline': [], 'counter': []}
    for g, data in germlines.groupby('light_germline'):
        light_germlines_dict['germline'].append(g)
        light_germlines_dict['counter'].append(data.sum()['counter'])
    light_germlines = pd.DataFrame(light_germlines_dict)

    light_germlines = light_germlines.sort_values(by='counter', ascending=False)

    lower = light_germlines['counter'].quantile(0.20)
    higher = light_germlines['counter'].quantile(0.80)

    selected_light_germlines = light_germlines[(light_germlines['counter'] > lower) & (light_germlines['counter'] < higher)]
    selected_light_germlines = selected_light_germlines.sample(int(len(selected_light_germlines)*0.1))

    merged = sequences.merge(selected_heavy_germlines, left_on='heavy_germline', right_on='germline', how='left', indicator=True)
    merged = merged[merged['_merge'] == 'left_only'][sequences.columns]
    merged = merged.merge(selected_light_germlines, left_on='light_germline', right_on='germline', how='left', indicator=True)
    merged = merged[merged['_merge'] == 'left_only'][sequences.columns]
    to_split = merged

    df1, df2 = train_test_split(to_split, test_size=0.25, random_state=1234567890)

    df2 = pd.concat([
        df2, 
        pd.merge(sequences, selected_heavy_germlines['germline'], 
                 left_on='heavy_germline', right_on='germline')[sequences.columns],
        pd.merge(sequences, selected_light_germlines['germline'], 
                 left_on='light_germline', right_on='germline')[sequences.columns]
    ])

    df2 = df2.drop_duplicates()

    return df1, df2 

In [20]:
representative = pd.read_csv(ONLY_REPRESENTATIVE, index_col=0)

In [21]:
germlines_all = compute_germline_data(representative)
trainval, test = split(germlines_all, representative)
test.to_csv(SEQUENCES_TEST)

In [22]:
germlines_trainval = compute_germline_data(trainval)
train, val = split(germlines_trainval, trainval)
train.to_csv(SEQUENCES_TRAIN)
val.to_csv(SEQUENCES_VAL)

### Get germline files

In [22]:
import get_germlines

In [23]:
COMPUTE_NEW = False

In [27]:
if COMPUTE_NEW:
    importlib.reload(get_germlines)
    get_germlines.get_germlines(
        SEQUENCES_TRAIN, 
        '{}/train_seq_only_v.csv'.format(TRAIN_DIR),
        '{}/train_germ_only_v.csv'.format(TRAIN_DIR),
        which='v')

Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 18
Number of possibile heavy and light combinations: 126
Saved: ../datasets/new/classificator/train/train_seq_only_v.csv, ../datasets/new/classificator/train/train_germ_only_v.csv


In [28]:
if COMPUTE_NEW:
    importlib.reload(get_germlines)
    get_germlines.get_germlines(
        SEQUENCES_VAL, 
        '{}/val_seq_only_v.csv'.format(VAL_DIR),
        '{}/val_germ_only_v.csv'.format(VAL_DIR),
        which='v')

Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 17
Number of possibile heavy and light combinations: 119
Saved: ../datasets/new/classificator/val/val_seq_only_v.csv, ../datasets/new/classificator/val/val_germ_only_v.csv


In [39]:
if COMPUTE_NEW:
    importlib.reload(get_germlines)
    get_germlines.get_germlines(
        SEQUENCES_TEST, 
        '{}/test_seq_only_v.csv'.format(TEST_DIR),
        '{}/test_germ_only_v.csv'.format(TEST_DIR),
        which='v')

Get germlines id...
Number of unique heavy combinations: 7
Number of unique light combinations: 17
Number of possibile heavy and light combinations: 119
Saved: ../datasets/new/classificator/test/test_seq_only_v.csv, ../datasets/new/classificator/test/test_germ_only_v.csv


### Germline pairing

In [30]:
import germline_pairing

In [31]:
ALPHA = 1000
NUMBER = None

In [32]:
COMPUTE_NEW = True

In [36]:
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/train_seq_only_v.csv'.format(TRAIN_DIR), 
                                      '{}/train-germline_pairing-alpha_{}_only_v.csv'.format(TRAIN_DIR, ALPHA),
                                      '{}/train_germ_only_v.csv'.format(TRAIN_DIR),
                                      NUMBER, ALPHA)

13 out of 126 of only zeros


100%|█████████████████████████████████████████| 13/13 [00:00<00:00, 1124.04it/s]
100%|████████████████████████████████████████| 13/13 [00:00<00:00, 55809.57it/s]
100%|██████████████████████████████████████████| 13/13 [00:00<00:00, 141.41it/s]


Retrieve sequences


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 30.66it/s]


Saved: ../datasets/new/classificator/train/train-germline_pairing-alpha_1000_only_v.csv


In [37]:
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/val_seq_only_v.csv'.format(VAL_DIR), 
                                      '{}/val-germline_pairing-alpha_{}_only_v.csv'.format(VAL_DIR, ALPHA),
                                      '{}/val_germ_only_v.csv'.format(VAL_DIR),
                                      NUMBER, ALPHA)

7 out of 119 of only zeros


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 1049.44it/s]
100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 34419.85it/s]
100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 198.24it/s]

Retrieve sequences



100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 46.48it/s]


Saved: ../datasets/new/classificator/val/val-germline_pairing-alpha_1000_only_v.csv


In [40]:
if COMPUTE_NEW:
    importlib.reload(germline_pairing)
    germline_pairing.germline_pairing('{}/test_seq_only_v.csv'.format(TEST_DIR), 
                                      '{}/test-germline_pairing-alpha_{}_only_v.csv'.format(TEST_DIR, ALPHA),
                                      '{}/test_germ_only_v.csv'.format(TEST_DIR),
                                      NUMBER, ALPHA)

7 out of 119 of only zeros


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 1008.63it/s]
100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 37211.82it/s]
100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 145.85it/s]


Retrieve sequences


100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 33.72it/s]


Saved: ../datasets/new/classificator/test/test-germline_pairing-alpha_1000_only_v.csv


### Random pairing

In [12]:
import pandas as pd
pd.read_csv('../datasets/new/classificator/train/train-germline_pairing-alpha_1000.csv', index_col=0)

Unnamed: 0,heavy_id,light_id,pair_id,pairing_index,heavy,light
0,323767,201,0,9835,QVQLVQSGSELKKPGASVKVSCKASGYTFTSYAMNWVRQAPGQGLE...,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPG...
1,882323,489318,1,12722,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLE...,QAGLTQPPSISKGLRETATLTCTGNSENVGSHGAAWLQQHQGHPPK...
2,45772,16293,2,11984,QVQLQESGPGLVKPSETLSLTCTISGGSISGYYWSWIRQPPGKGLE...,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYVQKPG...
3,404987,196781,3,12555,QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRG...,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPT...
4,461907,79910,4,11178,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYAMHWVRQAPGQRLE...,ETTLTQSPAFMSATPGDKVNISCKASQDIDDDMNWYQQKPGEAAIF...
...,...,...,...,...,...,...
716320,1261220,67919,702813,12226,EVQLVQSGAQVKKPGESLKISCKASGYSFTSYWIGWVRQMSGEGLE...,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAP...
716321,851718,490193,702814,13033,QVQVVQSGSELKEPGASVRISCRTSGYPFTTYPIHWVRQAPGHGLE...,QLVLTQSPSASASLGASVKLTCSLSRGHSSYAIAWHQQQPEKGPRY...
716322,730126,205339,702815,17200,QVTLRESGPALVKPKETLTLTCSFSGFSLSTAGMCMSWIRQPPGKA...,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPT...
716323,335218,388503,702816,16528,QVQLVQSGSELKKPGASVKVSCKASGYTFTSYAMNWVRQAPGQGLE...,QPVLTQPPSASASLGASVTLTCTLSSGYSSCNVDWYQQRPGKGPRF...


In [16]:
import random_pairing

In [17]:
NUMBER = None

In [18]:
COMPUTE_NEW = True

In [24]:
if COMPUTE_NEW:
    importlib.reload(random_pairing)
    random_pairing.random_pairing('{}/classificator/new/train/train_seq.csv'.format(DATASET_DIRECTORY),
                                  '{}/classificator/new/train_random/train_random.csv'.format(DATASET_DIRECTORY),
                                  NUMBER)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/new/classificator/new/train/train_seq.csv'

### Merge positive and negative

In [41]:
def merge_positive_and_negative(pos, neg):
    pos = pd.concat([pos[['pair_id', 'heavy', 'light']], 
                     pd.DataFrame({'class': np.zeros(len(pos), dtype=int)})],
                     axis=1)
    neg = pd.concat([neg[['pair_id', 'heavy', 'light']], 
                     pd.DataFrame({'class': np.ones(len(pos), dtype=int)})],
                     axis=1)
    data = pd.concat([pos, neg])
    return data.sample(len(data))

In [42]:
pos = pd.read_csv('{}/train_seq_only_v.csv'.format(TRAIN_DIR), index_col=0)
neg = pd.read_csv('{}/train-germline_pairing-alpha_{}_only_v.csv'.format(TRAIN_DIR, ALPHA))
data = merge_positive_and_negative(pos, neg)
data.to_csv('{}/train_only_v.csv'.format(TRAIN_DIR))

In [43]:
pos = pd.read_csv('{}/val_seq_only_v.csv'.format(VAL_DIR), index_col=0)
neg = pd.read_csv('{}/val-germline_pairing-alpha_{}_only_v.csv'.format(VAL_DIR, ALPHA))
data = merge_positive_and_negative(pos, neg)
data.to_csv('{}/val_only_v.csv'.format(VAL_DIR))

In [44]:
pos = pd.read_csv('{}/test_seq_only_v.csv'.format(TEST_DIR), index_col=0)
neg = pd.read_csv('{}/test-germline_pairing-alpha_{}_only_v.csv'.format(TEST_DIR, ALPHA))
data = merge_positive_and_negative(pos, neg)
data.to_csv('{}/test_only_v.csv'.format(TEST_DIR))

### Test dataset

In [3]:
test_df = pd.read_csv(SEQUENCES_VAL, index_col=0)
light_sequences = test_df[['pair_id', 'light_id', 'light']].drop_duplicates().reset_index(drop=True)
light_sequences = light_sequences.sample(len(light_sequences))

In [4]:
import Levenshtein
from tqdm import tqdm
from itertools import combinations
from Bio import Align

from matplotlib import pyplot as plt

In [5]:
ls = light_sequences['light'].drop_duplicates().sample(10000).to_numpy()
sim = []
for x, y in combinations(ls, 2):
    sim.append(Levenshtein.ratio(x, y))

In [6]:
print(np.mean(sim), np.std(sim), np.min(sim), np.max(sim))

0.6231716007187188 0.11292343526922217 0.3677130044843049 0.9956331877729258


In [7]:
def create_pairs(light_sequences, threshold, sim_func):
    pairs = []
    index = 0
    for _, r in tqdm(light_sequences.iterrows(), total=len(light_sequences)):
        found = False
        #print('searching a fella for seq', r['light_id'])
        while not found:
            sim = sim_func(r['light'], light_sequences.iloc[index, 2])
            if sim < threshold:
                found = True
                pairs.append((r['pair_id'], r['light_id'], light_sequences.iloc[index, 1]))
            index += 1
            if index == len(light_sequences): 
                index = 0
        #pairs.append((r['light_id'], light_sequences.iloc[index, 0]))
    return pairs

def create_pairs_random(light_sequences):
    pairs = []
    sampled = light_sequences.sample(len(light_sequences))
    for (_, r1), (_, r2) in zip(light_sequences.iterrows(), sampled.iterrows()):
        print(r1, r2)
    
        
        
pairs_list = create_pairs(light_sequences, np.mean(sim) - np.std(sim), Levenshtein.ratio)

#create_pairs_random(light_sequences)

100%|██████████████████████████████████| 271230/271230 [06:35<00:00, 685.57it/s]


In [8]:
pairs = pd.DataFrame({
    'pair_id': [x for x, _, _ in pairs_list],
    'positive_light': [x for _, x, _ in pairs_list],
    'negative_light': [x for _, _, x in pairs_list]
})

In [9]:
df_pos = pd.merge(light_sequences[['light_id', 'light']].drop_duplicates(), 
                  pairs, 
                  left_on='light_id', right_on='positive_light', how='right').rename({
    'light_id': 'light_id_pos',
    'light': 'light_pos'
}, axis=1)[['pair_id', 'light_id_pos', 'light_pos']]

In [10]:
df_neg = pd.merge(light_sequences[['light_id', 'light']].drop_duplicates(), 
                  pairs, 
                  left_on='light_id', right_on='negative_light', how='right').rename({
    'light_id': 'light_id_neg',
    'light': 'light_neg'
}, axis=1)[['pair_id', 'light_id_neg', 'light_neg']]

In [11]:
df = pd.merge(df_pos, df_neg)
df = test_df[['pair_id', 'heavy_id', 'heavy']].merge(df)
df.to_csv('{}/val.csv'.format(TEST_DIRECTORY))