In [1]:
# Import general packages
import sys
import os
import re
import glob
import json
import gzip
from tqdm.auto import tqdm
from pathlib import PurePath

# Parallel processing
from joblib import Parallel, delayed

# import data and math packages
import numpy as np
import pandas as pd
from pandarallel import pandarallel

from abnumber import Chain
from abnumber.exceptions import ChainParseError, MultipleDomainsChainParseError

In [None]:
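# Workflow of this notebook:
#   1. Load the initial CoV-AbDab database
#   2. Check the sequences (drop records whose CDR3 is not found in its chain sequence)
#   3. Compute the CDR sequences of the heavy and light chains
#   4. Write the database, with the CDRs added, to disk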

In [2]:
# I/O files location

# Directory that holds both the input and the output file
datadir = '/nfs/baron1/nolde/zhalevsky/covidmap_v16'

# Input file: CoV-AbDab CSV export, e.g.
# https://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_080224.csv
# NOTE(review): the URL above names release 080224 while dbfname names 130623 -- confirm which one is intended.
dbfname = 'CoV-AbDab_130623.csv'

# Output file (JSON), written into datadir
outfname = 'covadab_allcdr.json'

In [4]:
# Read the database; cells containing 'ND' are treated as missing values
db_path = PurePath(datadir, dbfname)
covadab = pd.read_csv(db_path, na_values='ND')

# Keep only records that have both chain sequences and both CDR3 sequences
required_cols = ['VHorVHH', 'VL', 'CDRH3', 'CDRL3']
covadab = covadab.dropna(subset=required_cols)

# Keep only rows labelled 'Ab' in the 'Ab or Nb' column
is_antibody = covadab['Ab or Nb'] == 'Ab'
covadab = covadab[is_antibody]
print(len(covadab))

11260


In [5]:
# Drop entries with sequence issues:
# a record is inconsistent when its CDRH3 is not a substring of the heavy-chain
# sequence or its CDRL3 is not a substring of the light-chain sequence.
bad_ids = [
    idx
    for idx, heavy, light, cdrh3, cdrl3 in zip(
        covadab.index,
        covadab['VHorVHH'],
        covadab['VL'],
        covadab['CDRH3'],
        covadab['CDRL3'],
    )
    if not (cdrh3 in heavy and cdrl3 in light)
]

# Keep a copy of the rejected rows for inspection before removing them
is_bad = covadab.index.isin(bad_ids)
bad_records = covadab.loc[is_bad]
print('Total bad records:', len(bad_records))
covadab.drop(index=bad_ids, inplace=True)


Total bad records: 41


In [6]:
# List the column labels available in the filtered table
print(covadab.columns)

Index(['Name', 'Ab or Nb', 'Binds to', 'Doesn't Bind to', 'Neutralising Vs',
       'Not Neutralising Vs', 'Protein + Epitope', 'Origin', 'VHorVHH', 'VL',
       'Heavy V Gene', 'Heavy J Gene', 'Light V Gene', 'Light J Gene', 'CDRH3',
       'CDRL3', 'Structures', 'ABB Homology Model (if no structure)',
       'Sources', 'Date Added', 'Last Updated', 'Update Description',
       'Notes/Following Up?', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')


In [7]:
def calc_cdrhl(row, hid, lid):
    """Compute the CDR1-3 sequences of the heavy and light chain of one record.

    Both sequences are passed to ``abnumber.Chain`` with ``scheme='imgt'``,
    ``allowed_species='human'`` and ``assign_germline=False``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Record holding the heavy- and light-chain sequences as strings.
    hid : hashable
        Key of ``row`` under which the heavy-chain sequence is stored.
    lid : hashable
        Key of ``row`` under which the light-chain sequence is stored.

    Returns
    -------
    tuple of str or None
        ``(CDRH1, CDRH2, CDRH3, CDRL1, CDRL2, CDRL3)``, or ``None`` when
        ``Chain`` raises ``ChainParseError`` for either sequence (the two input
        sequences are printed in that case).
    """
    heavy_seq = row[hid]
    light_seq = row[lid]

    # Single-chain record: the heavy field also contains the light sequence.
    # Cut the last len(light_seq) residues off the heavy field.
    # NOTE(review): this assumes the light sequence is the trailing part of the
    # heavy field -- confirm for the records where this branch triggers.
    # The emptiness guard matters because s[:-0] would yield an empty string.
    if light_seq and light_seq in heavy_seq:
        heavy_seq = heavy_seq[:-len(light_seq)]

    try:
        # Both chains are built on BOTH paths; previously they were only built
        # in the non-single-chain branch, so single-chain rows crashed with
        # UnboundLocalError at the return statement.
        chainh = Chain(heavy_seq, scheme='imgt', allowed_species='human', assign_germline=False)
        chainl = Chain(light_seq, scheme='imgt', allowed_species='human', assign_germline=False)
        return (
            chainh.cdr1_seq, chainh.cdr2_seq, chainh.cdr3_seq,
            chainl.cdr1_seq, chainl.cdr2_seq, chainl.cdr3_seq,
        )
    except ChainParseError:
        # Report the offending pair and let the caller see a missing result
        print(row[hid], row[lid])
        return None

In [8]:
# NOTE(review): pandarallel is initialised here, but the call below is tqdm's
# `progress_apply`, not pandarallel's `parallel_apply` -- confirm whether the
# parallel version was intended.
pandarallel.initialize(progress_bar=True)
tqdm.pandas()

# Columns receiving the six CDR sequences returned by calc_cdrhl
# (the existing CDRH3 / CDRL3 columns are overwritten)
cdr_columns = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3']
cdr_table = covadab.progress_apply(
    calc_cdrhl,
    axis=1,
    args=['VHorVHH', 'VL'],
    result_type='expand',
)
covadab[cdr_columns] = cdr_table

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/11219 [00:00<?, ?it/s]

In [9]:
# Write the table, including the computed CDR columns, as JSON into datadir
out_path = PurePath(datadir, outfname)
covadab.to_json(out_path)