## Antibody numbering

Antibody numbering (see [here](https://www.naturalantibody.com/use-case/introduction-to-antibody-numbering/) for a brief description) was performed using [ANARCI](https://github.com/oxpig/ANARCI), according to the [IMGT numbering scheme](https://www.imgt.org/IMGTScientificChart/Numbering/IMGTIGVLsuperfamily.html).

ANARCI was installed via conda. The `run_anarci` wrapper was used to enable parallel computation and direct parsing of .fasta files. For each dataset, the search was restricted to the corresponding species only (human for the human sequences, mouse for the mouse sequences).

In [1]:
import anarci
import pandas as pd

In [2]:
humanSeqPath = 'data/human.fa'
mouseSeqPath = 'data/mouse.fa'

# Settings common to the human and the mouse run. Each call below only adds
# the species restriction and the output file prefix for its own dataset.
shared_settings = {
    'ncpu': 8,                  # number of cores to be used
    'scheme': 'imgt',           # number the sequences with the IMGT scheme
    'assign_germline': True,    # assign the germline with the highest
                                # sequence identity to each chain
    'output': True,
    'csv': True,                # write the numbering as CSV under `outfile`
}

# Human sequences, searched against human germlines only.
h_results = anarci.run_anarci(
    humanSeqPath,
    allowed_species=['human'],
    outfile='data/human_numbering',
    **shared_settings,
)

# Mouse sequences, searched against mouse germlines only.
m_results = anarci.run_anarci(
    mouseSeqPath,
    allowed_species=['mouse'],
    outfile='data/mouse_numbering',
    **shared_settings,
)

In [15]:
from IPython.display import display

# Load the heavy-chain numbering tables written by ANARCI and preview them.
human_numbering = pd.read_csv('data/human_numbering_H.csv')
mouse_numbering = pd.read_csv('data/mouse_numbering_H.csv')

previews = (
    ('Human dataset:', human_numbering),
    ('Mouse dataset:', mouse_numbering),
)

# Raise the column limit so that every numbered position is rendered.
with pd.option_context('display.max_columns', 170):
    for position, (title, table) in enumerate(previews):
        if position:
            print('')       # blank line between consecutive previews
        print(title)
        display(table.head(15))

Human dataset:


Unnamed: 0,Id,domain_no,hmm_species,chain_type,e-value,score,seqstart_index,seqend_index,identity_species,v_gene,v_identity,j_gene,j_identity,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,67A,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,111A,111B,111C,111D,111E,111F,111G,111H,111I,112J,112I,112H,112G,112F,112E,112D,112C,112B,112A,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128
0,4,0,human,H,2.9e-56,180.2,0,129,human,IGHV3-9*01,0.98,IGHJ6*01,1.0,E,V,Q,L,V,E,S,G,G,-,G,L,V,Q,P,G,R,S,L,R,L,S,C,A,A,S,G,F,T,F,-,-,-,-,D,D,Y,A,M,H,W,V,R,Q,A,P,G,K,G,L,E,W,V,S,G,I,S,W,N,-,-,S,G,S,I,V,Y,-,A,D,S,V,K,-,G,R,F,T,S,S,R,D,N,A,K,N,S,L,Y,L,Q,M,N,S,L,R,A,E,D,T,A,L,Y,Y,C,A,K,D,Y,R,P,E,P,G,V,A,T,-,-,-,-,-,-,-,-,-,I,G,T,Y,Y,Y,Y,G,M,D,V,W,G,Q,G,T,T,V,T,V,S,S
1,5,0,human,H,8.2e-52,165.8,0,126,human,IGHV1-2*02,0.92,IGHJ6*01,1.0,Q,V,Q,L,L,Q,S,G,A,-,A,V,K,K,P,G,A,S,V,T,V,S,C,K,A,S,R,Y,T,F,-,-,-,-,T,G,S,Y,M,H,W,V,R,Q,A,P,G,Q,G,L,E,W,M,G,G,I,N,P,N,-,-,S,G,G,T,N,Y,-,A,Q,K,F,Q,-,G,R,V,T,M,T,R,D,T,S,I,S,T,A,Y,M,E,L,I,R,L,R,S,D,D,T,A,V,Y,Y,C,A,G,P,Y,Y,Y,G,S,G,R,-,-,-,-,-,-,-,-,-,-,-,-,L,P,H,Y,Y,S,G,M,D,V,W,G,Q,G,T,T,V,T,V,S,S
2,6,0,human,H,1.3e-53,171.7,0,128,human,IGHV3-49*03,0.95,IGHJ6*01,1.0,E,G,Q,L,V,E,S,G,G,-,G,W,V,Q,P,G,R,S,L,R,L,S,C,T,A,S,G,F,T,F,-,-,-,-,G,D,Y,A,M,S,W,F,R,Q,A,P,G,K,G,L,A,W,V,G,Y,N,R,S,K,A,Y,G,G,T,T,E,Y,-,A,A,S,V,K,-,G,R,F,T,I,S,R,D,D,S,K,S,I,A,Y,L,Q,M,N,S,L,K,T,E,D,T,A,V,Y,Y,C,T,R,G,G,G,S,Y,Y,G,E,-,-,-,-,-,-,-,-,-,-,-,-,D,Y,Y,Y,Y,Y,G,M,D,V,W,G,Q,G,T,T,V,T,V,S,S
3,7,0,human,H,1.2e-53,171.7,0,127,human,IGHV1-69*01,0.99,IGHJ6*01,1.0,Q,V,Q,L,V,Q,S,G,A,-,E,V,K,K,P,G,S,S,V,K,V,S,C,K,A,S,G,G,T,F,-,-,-,-,S,S,Y,A,I,S,W,V,G,Q,A,P,G,Q,G,L,E,W,M,G,G,I,I,P,I,-,-,F,G,T,A,N,Y,-,A,Q,K,F,Q,-,G,R,V,T,I,T,A,D,E,S,T,S,T,A,Y,M,E,L,S,S,L,R,S,E,D,T,A,V,Y,Y,C,A,R,V,P,G,E,G,Q,W,L,V,-,-,-,-,-,-,-,-,-,-,-,S,H,Y,Y,Y,Y,G,M,D,V,W,G,Q,G,T,T,V,T,V,S,S
4,8,0,human,H,5.6e-53,169.6,0,118,human,IGHV5-51*01,0.96,IGHJ5*02,0.93,E,A,R,L,V,Q,S,G,A,-,E,V,K,K,P,G,E,Y,L,K,I,S,C,K,G,S,G,Y,R,F,-,-,-,-,T,S,Y,W,I,G,W,V,R,Q,M,P,G,K,G,L,E,W,M,G,I,I,Y,P,G,-,-,D,S,D,T,R,Y,-,S,P,S,F,Q,-,G,Q,V,T,I,S,A,D,K,S,I,S,T,A,Y,L,Q,W,S,S,L,K,A,S,D,T,A,M,Y,Y,C,A,R,L,S,A,A,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,Y,S,W,L,D,P,W,G,Q,G,T,L,V,T,V,S,S
5,9,0,human,H,6.9e-49,156.4,0,128,human,IGHV2-5*02,0.97,IGHJ5*02,1.0,Q,I,T,L,K,E,S,G,P,-,T,L,V,K,P,T,Q,T,L,T,L,T,C,T,F,S,G,F,S,L,S,-,-,T,S,G,G,G,V,G,W,I,R,Q,P,P,G,K,A,L,E,W,L,A,L,I,Y,W,D,-,-,-,D,D,K,R,Y,-,S,P,S,L,K,-,R,R,L,T,I,T,K,D,T,S,K,N,Q,V,V,L,T,M,T,N,M,D,P,V,D,T,A,T,Y,Y,C,A,H,G,T,Y,C,S,S,T,S,C,-,-,-,-,-,-,-,-,-,-,-,Y,W,K,R,H,N,W,F,D,P,W,G,Q,G,T,L,V,T,V,S,S
6,10,0,human,H,5.6e-57,182.5,0,118,human,IGHV3-30*03,0.9,IGHJ4*01,0.93,Q,V,Q,L,V,D,F,G,G,-,G,V,F,Q,P,G,R,S,L,S,L,F,C,A,A,S,G,F,T,C,-,-,-,-,S,S,Y,G,M,H,W,V,R,Q,A,P,G,K,G,L,Q,W,V,A,V,I,R,Y,D,-,-,G,S,N,K,Y,Y,-,A,D,S,V,K,-,G,R,F,T,I,A,R,D,K,S,K,N,T,L,Y,L,Q,M,N,S,L,R,A,E,D,T,A,V,Y,Y,C,A,R,D,A,Y,S,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,M,T,M,Q,D,Y,W,G,Q,G,T,L,V,T,V,S,S
7,11,0,human,H,6.5e-56,179.1,0,117,human,IGHV5-51*01,0.95,IGHJ3*02,0.93,E,V,Q,L,V,Q,S,G,A,-,E,V,K,K,P,G,K,S,L,K,I,S,C,K,G,S,G,Y,S,F,-,-,-,-,T,T,Y,W,I,G,W,V,R,Q,M,P,G,K,G,L,E,W,L,G,I,I,Y,P,D,-,-,D,S,D,T,R,Y,-,S,P,S,F,R,-,G,Q,V,T,I,S,A,D,K,S,I,S,T,A,Y,L,Q,W,S,S,L,K,A,S,D,T,A,M,Y,Y,C,A,R,Q,R,G,P,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,F,A,F,N,I,W,G,Q,G,T,M,V,T,V,S,S
8,12,0,human,H,3.8000000000000004e-55,176.6,0,125,human,IGHV1-69*01,0.99,IGHJ6*01,1.0,Q,V,Q,L,V,Q,S,G,A,-,E,V,K,K,P,G,S,S,V,K,V,S,C,K,A,S,G,G,T,F,-,-,-,-,S,S,S,A,I,S,W,V,R,Q,A,P,G,Q,G,L,E,W,M,G,G,I,I,P,I,-,-,F,G,T,A,N,Y,-,A,Q,K,F,Q,-,G,R,V,T,I,T,A,D,E,S,T,S,T,A,Y,M,E,L,S,S,L,R,S,E,D,T,A,V,Y,Y,C,A,R,D,R,W,P,L,R,T,V,-,-,-,-,-,-,-,-,-,-,-,-,-,K,N,Y,Y,Y,G,M,D,V,W,G,Q,G,T,T,V,T,V,S,S
9,13,0,human,H,1.1e-56,181.6,0,123,human,IGHV1-46*01,0.93,IGHJ2*01,1.0,Q,V,Q,L,V,Q,S,G,A,-,E,V,K,K,P,G,A,S,V,M,V,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,D,Y,H,I,H,W,V,R,Q,A,P,G,Q,G,L,E,W,M,G,I,I,N,P,S,-,-,G,G,G,T,I,Y,-,A,Q,K,F,Q,-,G,R,V,T,M,T,R,D,T,S,T,S,T,V,Y,M,E,L,S,S,L,R,S,E,D,T,A,V,Y,F,C,A,R,G,G,L,T,V,T,T,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,S,K,N,W,F,F,D,L,W,G,R,G,T,L,V,T,V,S,S



Mouse dataset:


Unnamed: 0,Id,domain_no,hmm_species,chain_type,e-value,score,seqstart_index,seqend_index,identity_species,v_gene,v_identity,j_gene,j_identity,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,111A,111B,111C,111D,111E,112E,112D,112C,112B,112A,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128
0,4,0,mouse,H,1.1e-49,158.9,0,115,mouse,IGHV5-15*01,0.98,IGHJ3*01,0.79,E,V,Q,R,V,E,S,G,G,-,G,L,V,Q,P,G,G,S,L,K,L,S,C,A,A,S,G,F,T,F,-,-,-,-,S,D,Y,G,M,A,W,V,R,Q,A,P,R,K,G,P,E,W,V,A,F,I,S,N,L,-,-,A,Y,S,I,Y,Y,A,D,T,V,T,-,G,R,F,T,I,S,R,E,N,A,K,N,T,L,Y,L,E,M,S,S,L,R,S,E,D,T,A,M,Y,Y,C,A,R,Q,G,D,-,-,-,-,-,-,-,-,-,-,-,-,-,-,G,Y,S,N,W,G,Q,G,T,L,V,T,V,S,A
1,5,0,mouse,H,8.4e-59,188.3,0,118,mouse,IGHV1-55*01,1.0,IGHJ2*01,0.86,Q,V,Q,L,Q,Q,P,G,A,-,E,L,V,K,P,G,A,S,V,K,M,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,S,Y,W,I,T,W,V,K,Q,R,P,G,Q,G,L,E,W,I,G,D,I,Y,P,G,-,-,S,G,S,T,N,Y,N,E,K,F,K,-,S,K,A,T,L,T,V,D,T,S,S,S,T,A,Y,M,Q,L,S,S,L,T,S,E,D,S,A,V,Y,Y,C,A,R,K,G,S,V,-,-,-,-,-,-,-,-,-,-,-,Y,G,N,Y,V,Y,W,G,Q,G,T,T,L,T,V,S,S
2,6,0,mouse,H,6.5e-57,182.2,0,121,mouse,IGHV5-17*01,0.98,IGHJ2*01,1.0,E,V,K,L,M,E,S,G,G,-,G,L,V,K,P,G,G,S,L,K,L,S,C,A,A,S,G,F,T,F,-,-,-,-,S,D,Y,G,M,H,W,V,R,Q,A,P,E,K,G,L,E,W,V,A,Y,I,S,S,G,-,-,S,S,T,I,Y,Y,A,D,T,V,K,-,G,R,F,T,I,S,R,D,N,A,K,N,T,L,F,L,Q,M,T,S,L,R,S,E,D,T,A,M,Y,Y,C,A,R,D,Y,Y,Y,G,S,-,-,-,-,-,-,-,-,S,S,Y,Y,F,D,Y,W,G,Q,G,T,T,L,T,V,S,S
3,7,0,mouse,H,1.8e-51,164.6,0,111,mouse,IGHV2-3*01,0.97,IGHJ3*01,0.86,-,V,Q,L,Q,Q,S,G,P,-,G,L,V,A,P,S,Q,S,L,S,I,T,C,T,V,S,G,F,S,L,-,-,-,-,T,S,Y,G,V,S,W,V,R,Q,P,P,G,K,G,L,E,W,L,G,V,I,W,G,D,-,-,-,G,S,T,N,Y,H,S,A,L,I,-,S,R,L,S,I,S,K,D,N,S,K,S,Q,V,F,L,K,L,N,S,L,Q,T,D,D,T,A,T,Y,Y,C,A,K,H,L,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,H,S,Y,W,G,Q,G,T,L,V,T,V,S,A
4,8,0,mouse,H,3.3000000000000004e-60,192.8,0,120,mouse,IGHV1-72*01,1.0,IGHJ4*01,1.0,Q,V,Q,L,Q,Q,P,G,A,-,E,L,V,K,P,G,A,S,V,K,L,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,S,Y,W,M,H,W,V,K,Q,R,P,G,R,G,L,E,W,I,G,R,I,D,P,N,-,-,S,G,G,T,K,Y,N,E,K,F,K,-,S,K,A,T,L,T,V,D,K,P,S,S,T,A,Y,M,Q,L,S,S,L,T,S,E,D,S,A,V,Y,Y,C,A,R,G,Y,G,N,Y,-,-,-,-,-,-,-,-,-,D,Y,Y,A,M,D,Y,W,G,Q,G,T,S,V,T,V,S,S
5,9,0,mouse,H,1.4e-58,187.6,0,116,mouse,IGHV1-72*01,0.97,IGHJ2*01,1.0,Q,V,Q,L,Q,Q,P,G,A,-,E,L,V,K,P,G,A,S,V,K,L,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,S,Y,W,M,H,W,V,K,Q,R,P,G,R,G,L,A,W,I,G,R,M,D,P,N,-,-,S,G,G,N,K,Y,N,E,K,F,K,-,S,K,A,T,L,T,V,D,K,P,S,S,T,A,Y,M,Q,L,S,S,L,T,S,E,D,S,A,V,Y,Y,C,A,R,E,G,T,-,-,-,-,-,-,-,-,-,-,-,-,-,G,Y,F,D,Y,W,G,Q,G,T,T,L,T,V,S,S
6,10,0,mouse,H,6.800000000000001e-54,172.4,0,112,mouse,IGHV3-6*01,0.97,IGHJ4*01,1.0,D,V,H,L,V,E,S,G,P,-,G,L,V,K,P,S,Q,S,L,S,L,T,C,S,V,T,G,Y,S,I,T,-,-,-,S,G,Y,Y,W,N,W,I,R,Q,F,P,G,N,K,L,E,W,M,G,Y,I,S,Y,D,-,-,-,G,S,N,N,Y,N,P,S,L,K,-,N,R,I,S,I,T,R,D,T,S,K,N,Q,F,F,L,K,L,N,S,V,T,T,E,D,T,A,T,Y,Y,C,A,R,G,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,M,D,Y,W,G,Q,G,T,S,V,T,V,S,S
7,11,0,mouse,H,6.200000000000001e-60,191.9,0,117,mouse,IGHV1-53*01,0.97,IGHJ3*01,1.0,-,V,Q,L,Q,Q,P,G,T,-,E,L,V,K,P,G,A,S,V,K,L,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,S,Y,W,M,H,W,V,Q,Q,R,P,G,Q,G,L,E,W,I,G,N,I,N,P,S,-,-,N,G,G,T,N,Y,N,E,K,F,K,-,S,Q,A,T,L,T,V,D,K,S,S,S,T,A,Y,M,Q,L,S,S,L,T,S,E,D,S,A,V,Y,Y,C,A,R,D,Y,G,N,-,-,-,-,-,-,-,-,-,-,-,Y,V,L,F,A,Y,W,G,Q,G,T,L,V,T,V,S,A
8,12,0,mouse,H,6.300000000000001e-62,198.4,0,118,mouse,IGHV1-64*01,1.0,IGHJ2*01,0.93,Q,V,Q,L,Q,Q,P,G,A,-,E,L,V,K,P,G,A,S,V,K,L,S,C,K,A,S,G,Y,T,F,-,-,-,-,T,S,Y,W,M,H,W,V,K,Q,R,P,G,Q,G,L,E,W,I,G,M,I,H,P,N,-,-,S,G,S,T,N,Y,N,E,K,F,K,-,S,K,A,T,L,T,V,D,K,S,S,S,T,A,Y,M,Q,L,S,S,L,T,S,E,D,S,A,V,Y,Y,C,A,R,D,T,T,V,-,-,-,-,-,-,-,-,-,-,-,E,N,Y,F,D,Y,W,G,Q,G,T,T,L,T,V,S,P
9,13,0,mouse,H,1.5e-54,174.5,0,111,mouse,IGHV5-9*01,0.98,IGHJ1*03,0.93,E,V,K,L,M,E,S,G,G,-,G,L,V,K,P,G,G,S,L,K,L,S,C,A,A,S,G,F,T,F,-,-,-,-,S,S,Y,T,M,S,W,V,R,Q,T,P,E,K,R,L,E,W,V,A,T,I,S,G,G,-,-,G,G,N,T,Y,Y,P,D,S,V,K,-,G,R,F,T,I,S,R,D,N,A,K,N,T,L,Y,L,Q,M,S,S,L,R,S,E,D,T,A,L,Y,Y,C,A,R,R,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,D,V,W,G,T,G,T,T,V,T,V,S,S


In [2]:
def numbering_to_fasta(csvPath, organism: str):
    """
    Convert a CSV file with the IMGT numbering produced by ANARCI into FASTA.

    Every CSV row becomes one FASTA record: the header is the value of the
    ``Id`` column and the sequence is the concatenation, in file order, of
    the cells of all position columns (labels made of a number optionally
    followed by an insertion letter, e.g. ``1``, ``67A``, ``112``). Cells are
    copied verbatim, so gap characters stay in the sequence.

    The records are written to ``data/alignments/<organism>_hmm_align.fasta``
    relative to the current working directory; the directory is created if
    it does not exist yet.

    Parameters
    ----------
    csvPath
        Path of the numbering CSV to read (anything ``pandas.read_csv``
        accepts).
    organism : str
        Label used to build the name of the output file.

    Raises
    ------
    ValueError
        If the CSV has no ``Id`` column.
    """
    import os
    import re

    # Read every cell as text and disable NaN detection, so identifiers such
    # as "007" or "NA" and all residue letters are written back unchanged.
    numberingDf = pd.read_csv(csvPath, dtype=str, keep_default_na=False)

    if 'Id' not in numberingDf.columns:
        raise ValueError(f"no 'Id' column found in {csvPath!r}")

    # Require a full match on the label: a substring search would also keep
    # metadata columns that merely contain a digit (e.g. "Unnamed: 0").
    position_label = re.compile(r'\d+[A-Za-z]*')
    position_columns = [
        column for column in numberingDf.columns
        if position_label.fullmatch(column)
    ]

    out_dir = os.path.join('data', 'alignments')
    os.makedirs(out_dir, exist_ok=True)
    outFile = os.path.join(out_dir, organism + '_hmm_align.fasta')

    seq_ids = numberingDf['Id'].tolist()
    residue_rows = numberingDf[position_columns].to_numpy().tolist()

    with open(outFile, 'w') as f:
        for seq_id, residues in zip(seq_ids, residue_rows):
            sequence = ''.join(residues)
            f.write(f'>{seq_id}\n{sequence}\n')

In [3]:
# Export the heavy-chain numbering of each organism as an aligned FASTA file.
numbering_jobs = (
    ('data/human_numbering_H.csv', 'human'),
    ('data/mouse_numbering_H.csv', 'mouse'),
)
for numbering_csv, organism_label in numbering_jobs:
    numbering_to_fasta(numbering_csv, organism_label)