In [1]:
import sys
import numpy as np
import pandas as pd
import re
import subprocess
from pathlib import Path
import fileinput
from biopandas.pdb import PandasPdb

In [2]:
# ----------------------------------------------------------Anarci--------------------------------------------------

def anarci(seq, scheme):
    """Number an antibody sequence by running the ``anarci`` command-line tool.

    The tool is invoked as
    ``anarci --sequence <seq> --scheme <scheme> --assign_germline`` and its
    text output (stdout and stderr combined) is parsed.

    Parameters
    ----------
    seq : str
        Amino-acid sequence passed to ``--sequence``.
    scheme : str
        Numbering scheme name passed to ``--scheme`` (e.g. ``'aho'``).

    Returns
    -------
    stats : dict
        String fields read positionally from the '|'-separated '#' lines of
        the report: ``chain``, ``start`` and ``stop`` from the 6th '#' line;
        ``species``, ``v_gene``, ``v_id``, ``j_gene`` and ``j_id`` from the
        9th '#' line.
    df_renum : pandas.DataFrame
        One row per numbered residue, in output order, with columns
        ``chain``, ``new_residue_number`` (int) and ``new_insertion``
        ('' when the position has no insertion code). Gap positions
        (residue ``-``) are omitted.

    Raises
    ------
    ValueError
        If the output lacks the expected header lines (for example because
        ANARCI reported an error); the message includes the raw output.
    """
    out = subprocess.run(
        ['anarci', '--sequence', seq, '--scheme', scheme, '--assign_germline'],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = out.stdout.decode("utf-8").splitlines()

    # Separate the '#' header lines from everything else before parsing, so a
    # failed run is reported as such instead of tripping over stray text.
    header_rows = []
    body_lines = []
    for line in output:
        if line.startswith('#'):
            header_rows.append(line.split('|'))
        else:
            body_lines.append(line)

    # NOTE(review): the header positions (6th and 9th '#' line) are hard-coded
    # to the report layout this notebook was written against — confirm when
    # changing ANARCI versions or options.
    if len(header_rows) < 9 or len(header_rows[5]) < 7 or len(header_rows[8]) < 6:
        raise ValueError(
            "Could not parse ANARCI output for sequence %r:\n%s"
            % (seq, "\n".join(output)))

    anarci_dict = {'chain': [], 'new_residue_number': [], 'new_insertion': []}
    for line in body_lines:
        fields = line.split()
        if len(fields) == 3:
            # "<chain> <number> <residue>"
            chain_type, position, residue = fields
            insertion = ''
        elif len(fields) == 4:
            # "<chain> <number> <insertion code> <residue>"
            chain_type, position, insertion, residue = fields
        else:
            # Anything else (e.g. the closing '//') is not a numbering row.
            continue
        if residue == '-':
            # Gap: the scheme position exists but no residue occupies it.
            continue
        anarci_dict['chain'].append(chain_type)
        anarci_dict['new_residue_number'].append(int(position))
        anarci_dict['new_insertion'].append(insertion)

    hit, germline = header_rows[5], header_rows[8]
    stats = {'chain': hit[2], 'start': hit[5], 'stop': hit[6],
             'species': germline[1], 'v_gene': germline[2], 'v_id': germline[3],
             'j_gene': germline[4], 'j_id': germline[5]}
    df_renum = pd.DataFrame.from_dict(anarci_dict)
    return (stats, df_renum)
    

In [3]:
def seq_order(df):
    """Renumber residues 1, 2, 3, ... in order of first appearance.

    A residue is identified by its ``residue_number`` joined with its
    ``insertion`` code (both as strings). Every row of the same residue
    receives the same new number, and all insertion codes are blanked.

    The frame is modified in place and also returned.
    """
    residue_keys = df['residue_number'].astype(str) + df['insertion'].astype(str)
    # dict.fromkeys keeps the first occurrence of each key, in encounter order.
    rank_of = {key: rank
               for rank, key in enumerate(dict.fromkeys(residue_keys), start=1)}
    df['residue_number'] = residue_keys.map(rank_of)
    df['insertion'] = ''
    return df

In [4]:
def atom_renum(df):
    """Renumber ``atom_number`` as 1, 2, 3, ... in order of first appearance.

    Rows that shared an atom number before still share one afterwards.
    The frame is modified in place and also returned.
    """
    serial_of = {
        old: serial
        for serial, old in enumerate(dict.fromkeys(df['atom_number']), start=1)
    }
    df['atom_number'] = df['atom_number'].map(serial_of)
    return df

In [5]:
# Antibody model; its ATOM chains are the "Fv chains" counted and renumbered below.
pdbfile="/media/hdd1/XGmAbBoost/5vkj_CD22_glycan/ino_model_101.pdb"
# Antigen structure; its ATOM and HETATM records are appended after the antibody chains.
pdbfile2="/media/hdd1/XGmAbBoost/5vkj_CD22_glycan/5vkj_g.pdb"
# Numbering scheme name handed to `anarci --scheme`.
scheme='aho' 
# Path the combined, renumbered structure is written to by ppdb.to_pdb.
outfile='/media/hdd1/XGmAbBoost/5vkj_CD22_glycan/test.pdb'

In [6]:
# Load both structures: ppdb holds the antibody model, ppdb2 the antigen.
ppdb, ppdb2 = PandasPdb(), PandasPdb()
ppdb.read_pdb(pdbfile)
ppdb2.read_pdb(pdbfile2)

<biopandas.pdb.pandas_pdb.PandasPdb at 0x7f1481ce81d0>

In [7]:
# The antibody model may contain at most two chains; stop otherwise.
chains = ppdb.df['ATOM'].chain_id.unique()
chain_num = chains.size
print("Number of Fv chains:", chain_num)
if chain_num >= 3:
    sys.exit("Ya ok this only works with one Fab or ScFv molecule")

Number of Fv chains: 2


In [8]:
# The antigen structure must consist of a single chain; stop otherwise.
chains_A = ppdb2.df['ATOM'].chain_id.unique()
chain_num_A = chains_A.size
print("Number of Antigen chains:", chain_num_A)
if chain_num_A >= 2:
    sys.exit("Ya ok this only works with one Antigen chain")

Number of Antigen chains: 1


In [9]:
# Antigen ATOM records relabelled as chain 'A'. `.assign` returns a new frame,
# so ppdb2.df['ATOM'] is not modified and re-running this cell always yields
# exactly one trailing TER row instead of stacking another one each time.
adf = ppdb2.df['ATOM'].assign(chain_id='A')
tnum = len(adf)
# Append a TER record by cloning the last atom row (assumes the default
# 0..n-1 index), then give it the TER record name and the next atom number.
adf.loc[tnum] = adf.loc[tnum-1]
adf.loc[tnum, 'record_name'] = 'TER'
adf.loc[tnum, 'atom_number'] = adf.loc[tnum, 'atom_number'] + 1

In [10]:
# Display the antigen atom table (chain set to 'A', TER row appended) for inspection.
adf

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,N,,GLU,,A,17,,...,-56.149,36.816,20.228,1.0,63.67,,,N,,436
1,ATOM,2,,CA,,GLU,,A,17,,...,-55.841,36.844,18.803,1.0,60.60,,,C,,437
2,ATOM,3,,C,,GLU,,A,17,,...,-57.103,36.995,17.961,1.0,68.64,,,C,,438
3,ATOM,4,,O,,GLU,,A,17,,...,-57.889,36.057,17.823,1.0,69.93,,,O,,439
4,ATOM,5,,CB,,GLU,,A,17,,...,-55.086,35.576,18.394,1.0,69.55,,,C,,440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2471,ATOM,2472,,CG,,GLN,,A,328,,...,28.746,-10.929,5.600,1.0,66.72,,,C,,2907
2472,ATOM,2473,,CD,,GLN,,A,328,,...,29.666,-10.324,6.643,1.0,68.69,,,C,,2908
2473,ATOM,2474,,OE1,,GLN,,A,328,,...,30.113,-11.006,7.565,1.0,72.76,,,O,,2909
2474,ATOM,2475,,NE2,,GLN,,A,328,,...,29.957,-9.036,6.498,1.0,64.09,,,N,,2910


In [11]:
# Stack the antigen's HETATM records under its ATOM/TER rows and renumber the
# index 0..n-1. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
adf = pd.concat([adf, ppdb2.df['HETATM']]).reset_index(drop=True)

In [12]:
# Discard the ANISOU record tables of both structures.
for structure in (ppdb, ppdb2):
    del structure.df['ANISOU']

In [13]:
def df_to_seq(chain):
    """Return the one-letter amino-acid sequence of ``chain`` as a string.

    Uses the table produced by ``ppdb.amino3to1()`` (the notebook-level
    ``ppdb`` object), keeps the rows whose ``chain_id`` equals ``chain`` and
    joins their ``residue_name`` values. An unknown chain gives ''.
    """
    one_letter = ppdb.amino3to1()
    in_chain = one_letter['chain_id'] == chain
    return ''.join(one_letter.loc[in_chain, 'residue_name'])

In [14]:
# For every chain of the antibody model: renumber its residues 1..n with
# seq_order, store the renumbered atoms back into ppdb, and record the chain's
# one-letter sequence. Each chain_info entry is [chain id, '<chain id>df', sequence].
chain_info = []
df = ppdb.df['ATOM'].copy()
renumbered_chains = []
for chain in df['chain_id'].unique():
    cdf = seq_order(df[df['chain_id'] == chain].reset_index(drop=True))
    renumbered_chains.append(cdf)
    # df_to_seq works from ppdb, so ppdb.df['ATOM'] must already hold this
    # chain's renumbered atoms when it is called. pd.concat replaces
    # DataFrame.append, which pandas 2.0 removed.
    ppdb.df['ATOM'] = pd.concat(renumbered_chains).reset_index(drop=True)
    seq = df_to_seq(chain)
    chain_info.append([chain, chain + 'df', seq])

In [15]:
# Work on a copy of the renumbered antibody atoms so ppdb.df['ATOM'] itself is not modified.
df =ppdb.df['ATOM'].copy()

In [16]:
def _renumber_chain(chain_atoms, stats, num):
    """Swap a chain's sequential residue numbers for ANARCI's numbering.

    chain_atoms : atom rows of one chain, with ``residue_number`` as produced
        by seq_order (1, 2, 3, ...).
    stats, num : the two values returned by anarci() for that chain.

    Returns ``(renumbered_atoms, renum)``. ``renum`` is the lookup table used
    for the merge: ``num`` without its ``chain`` column plus a sequential
    ``residue_number`` column.
    """
    renum = num.drop(columns=['chain'])
    # Row i of the ANARCI table is matched to sequential residue i + 1.
    # NOTE(review): this assumes the numbered region starts at the chain's
    # first residue (stats['start'] is not used) — confirm.
    renum['residue_number'] = np.arange(len(renum)) + 1
    # NOTE(review): residue_number is 1-based while stats['stop'] is taken
    # verbatim from ANARCI's header; confirm the two conventions agree so the
    # last numbered residue is not cut off.
    past_domain = chain_atoms['residue_number'] > int(stats['stop'])
    renumbered = (
        chain_atoms[~past_domain]
        .merge(renum, how='left', on='residue_number')
        .drop(columns=['residue_number', 'insertion'])
        .rename(columns={'new_residue_number': 'residue_number',
                         'new_insertion': 'insertion'})
    )
    return renumbered, renum


# ANARCI chain-type code -> (chain id used in the output, wording for the log line).
chain_kinds = {
    'H': ('H', 'Heavy'),
    'K': ('L', 'Kappa Light'),
    'L': ('L', 'Lambda Light'),
}

# Select atoms by the chain ids they had on entry: relabelling one chain to
# 'H'/'L' must not make it collide with another chain that already uses that id.
input_chain_ids = df['chain_id'].copy()

for chain in chain_info:
    chain_id, chain_seq = chain[0], chain[2]
    stats, num = anarci(chain_seq, scheme)
    if stats['chain'] not in chain_kinds:
        sys.exit("Chain %s is not an antibody chain, sorry" % chain_id)
    new_id, kind = chain_kinds[stats['chain']]
    print('Chain %s is a %s chain' % (chain_id, kind))

    from_this_chain = input_chain_ids == chain_id
    df.loc[from_this_chain, 'chain_id'] = new_id
    chain_df, renum = _renumber_chain(
        df[from_this_chain].reset_index(drop=True), stats, num)

    # Later cells read the heavy/light results under these fixed names.
    if new_id == 'H':
        hdf_stats, h_seq, hdf_renum, hdf = stats, chain_seq, renum, chain_df
    else:
        ldf_stats, l_seq, ldf_renum, ldf = stats, chain_seq, renum, chain_df

Chain H is a Heavy chain
Chain L is a Kappa Light chain


In [17]:
# start = number of antibody atom rows; stop = start plus the antigen rows.
start = sum(len(frame) for frame in (hdf, ldf))
stop = start + len(adf)
print(start, stop - start)

3520 2537


In [18]:
# Give the antigen rows atom numbers start+1 .. stop, assigned by matching index labels.
remap = pd.DataFrame({'atom_number': np.arange(start + 1, stop + 1)})
adf['atom_number'] = remap['atom_number']

In [19]:
# Display only: the sorted, re-indexed frame is shown but not assigned, so adf itself is unchanged.
adf.sort_values(by=['atom_number']).reset_index(drop=True)

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,3521,,N,,GLU,,A,17,,...,-56.149,36.816,20.228,1.0,63.67,,,N,,436
1,ATOM,3522,,CA,,GLU,,A,17,,...,-55.841,36.844,18.803,1.0,60.60,,,C,,437
2,ATOM,3523,,C,,GLU,,A,17,,...,-57.103,36.995,17.961,1.0,68.64,,,C,,438
3,ATOM,3524,,O,,GLU,,A,17,,...,-57.889,36.057,17.823,1.0,69.93,,,O,,439
4,ATOM,3525,,CB,,GLU,,A,17,,...,-55.086,35.576,18.394,1.0,69.55,,,C,,440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2532,HETATM,6053,,O2,,MAN,,A,405,,...,-23.773,55.112,11.239,1.0,82.83,,,O,,2968
2533,HETATM,6054,,O3,,MAN,,A,405,,...,-25.879,54.681,12.911,1.0,83.57,,,O,,2969
2534,HETATM,6055,,O4,,MAN,,A,405,,...,-28.010,55.016,11.052,1.0,71.25,,,O,,2970
2535,HETATM,6056,,O5,,MAN,,A,405,,...,-25.238,54.127,8.824,1.0,78.48,,,O,,2971


In [20]:
# Empty ppdb's record tables in place; their column layout is kept.
for record in ('ATOM', 'OTHERS', 'HETATM'):
    ppdb.df[record].drop(ppdb.df[record].index, inplace=True)

# Heavy + light chain atoms, renumbered 1..n, followed by the antigen rows.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
# NOTE: hdf is rebound to the combined frame, so re-running this cell without
# first re-running the chain classification cell stacks ldf and adf again.
hdf = pd.concat([hdf, ldf], sort=False)
hdf = atom_renum(hdf)
hdf = pd.concat([hdf, adf], sort=False)
hdf['line_idx'] = np.arange(1, len(hdf) + 1)

# Store the rows using the ATOM table's own column order first (the merge in
# the renumbering step moved residue_number/insertion to the end), followed by
# any extra columns — presumably to_pdb relies on that order; confirm.
atom_columns = list(ppdb.df['ATOM'].columns)
extra_columns = [col for col in hdf.columns if col not in atom_columns]
ppdb.df['ATOM'] = hdf.reindex(columns=atom_columns + extra_columns)

In [21]:
# ppdb.

In [22]:
# Write the assembled structure held in ppdb to outfile.
# records=None presumably selects every record table — confirm against the biopandas docs.
ppdb.to_pdb(outfile, records=None)        

In [23]:
# One-row summary tables of the ANARCI stats for the heavy and the light chain.
h_stats, l_stats = (
    pd.DataFrame.from_records([chain_stats])
    for chain_stats in (hdf_stats, ldf_stats)
)