# **Imports**

In [1]:
import pandas as pd
import os
from joblib import Parallel, delayed
import multiprocessing as mp
from sklearn.preprocessing import StandardScaler
import subprocess
import numpy as np

# **Definitions**

In [2]:
# Per-residue hydrophobicity keyed by one-letter amino-acid code.
# NOTE(review): the values look like the Kyte-Doolittle scale ("KD") - confirm against the reference.
dic_hydro_KD = dict(
    G=-0.4, A=1.8, V=4.2, F=2.8, P=-1.6,
    M=1.9, I=4.5, L=3.8, D=-3.5, E=-3.5,
    K=-3.9, R=-4.5, S=-0.8, T=-0.7, Y=-1.3,
    H=-3.2, C=2.5, N=-3.5, Q=-3.5, W=-0.9,
)

# Alternative hydrophobicity scale with the same keys.
# NOTE(review): the values look like the Eisenberg consensus scale ("E") - confirm against the reference.
dic_hydro_E = dict(
    G=0.48, A=0.62, V=1.08, F=1.19, P=0.12,
    M=0.64, I=1.38, L=1.06, D=-0.9, E=-0.74,
    K=-1.5, R=-2.53, S=-0.18, T=-0.05, Y=0.26,
    H=-0.4, C=0.29, N=-0.78, Q=-0.85, W=0.81,
)

# Formal side-chain charge per residue: D and E are -1, K and R are +1, all other residues are 0.
dic_charge = {
    aa: {'D': -1, 'E': -1, 'K': 1, 'R': 1}.get(aa, 0)
    for aa in 'GAVFPMILDEKRSTYHCNQW'
}

# Expected length of the concatenated two-chain sequence for each HLA group.
# Each value equals the sum of the two matching chain lengths in dic_locus_lens
# (e.g. 'A': 276 + 99 = 375, 'DQ': 186 + 192 = 378).
dic_groupe_lens = {'A':375, 'B':375, 'C':375, 'DP':374, 'DQ':378, 'DR':374}
# Length of each individual chain, keyed by locus ('B2' is the second chain used for groups A, B and C).
dic_locus_lens = {'A':276, 'B':276, 'C':276, 'B2':99, 'DQA':186, 'DQB':192, 'DRA':182, 'DRB':192,
                   'DPA':183, 'DPB':191}

base_dir = os.getcwd()
# Helper script called on the frame-0 PDB of each antigen to produce its FASTA sequence.
pdb2fasta = f'{base_dir}/pdb2fasta.sh'
# Surface-accessibility method; selects the '<surf_meth>_median.txt' input and tags output file names.
surf_meth = 'RSASA'
# Tag of the hydrophobicity scale, used in output file names.
hydro_scale = 'KD'

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
dir_data='/home/damaya/capsid_new/HLA-EpiCheck'

# Patch radius, used in patch and output file names (presumably Angstrom - confirm).
radius_patch = 15
# Minimum surface accessibility for a residue to count as exposed (presumably percent RSASA - confirm).
thresh_surf = 20

# Each non-empty line of list_antigens.txt is a '/'-separated path whose
# second-to-last component is the antigen name. Blank lines are skipped:
# they would otherwise raise IndexError on the [-2] lookup.
with open(f'{dir_data}/list_antigens.txt', 'r') as f:
    list_avail_alleles = [line.strip().split('/')[-2] for line in f if line.strip()]

# **Table #0 (PatchDescription)**

This section computes the tables table_0* and track_resids_patchs_table_0*. The table table_0* contains the charge and hydrophobicity of the central amino acid (AA) of each patch.
The table track_resids_patchs_table_0* records, for each patch, the antigen it belongs to and the residue number of its central AA.

In [3]:

# Build table_0 (charge / hydrophobicity of each patch's central residue) and
# track_resids_patchs (patch -> antigen and central residue). One patch is created
# per residue whose median surface accessibility is >= thresh_surf.
#
# Rows are collected in plain lists and turned into DataFrames once at the end,
# instead of enlarging the DataFrames one row at a time.
patch_rows = []
track_rows = []

for antigen in list_avail_alleles:
    prefix = antigen.split('_')[0]
    groupe = prefix if prefix in ['A', 'B', 'C'] else prefix[:2]
    dir_antigen = f'{dir_data}/{antigen}'
    # The shell is needed here for the glob and the output redirection.
    os.system(f'{pdb2fasta} {dir_antigen}/PDBs/*-frame_0.pdb > {dir_antigen}/PDBs/seq_frame_0.txt')

    with open(f'{dir_antigen}/PDBs/seq_frame_0.txt') as file1:
        seq_fasta = [line.strip() for line in file1]

    # Lines 1 and 3 of the FASTA output hold the sequences of the two chains.
    seq_antigen = seq_fasta[1] + seq_fasta[3]
    expected_len = dic_groupe_lens[groupe]
    if len(seq_antigen) != expected_len:
        # Record the mismatch and carry on, as before (same message, same log file).
        with open(f'{base_dir}/log_error_lens', 'a') as log_file:
            log_file.write(f'Error at {antigen}, len seq_antigen = {len(seq_antigen)} '
                           f'and dic lens = {expected_len}\n')

    df_surf = pd.read_csv(f'{dir_antigen}/{surf_meth}_median.txt',
                          header=0, sep='\t', usecols=[1])
    # Re-index rows with 1-based residue numbers.
    df_surf['index'] = range(1, expected_len + 1)
    df_surf.set_index('index', inplace=True)
    col_name = 'RSASA' if surf_meth == 'RSASA' else 'mean'
    surface_AA = df_surf.loc[df_surf[col_name] >= thresh_surf]

    for central_AA in surface_AA.index:
        residue = seq_antigen[central_AA - 1]
        charge = dic_charge[residue]
        charge_pos = charge if charge > 0 else 0
        charge_neg = charge if charge < 0 else 0
        # NOTE(review): always uses the KD table, whatever hydro_scale is set to.
        hydro = dic_hydro_KD[residue]
        patch_ID = len(patch_rows)
        patch_rows.append([patch_ID, charge_pos, charge_neg, hydro])
        track_rows.append([patch_ID, antigen, central_AA])

patch_count = len(patch_rows)
table_0 = pd.DataFrame(patch_rows, columns=['patch_ID', 'charge_+', 'charge_-', 'hydro'])
track_resids_patchs = pd.DataFrame(track_rows, columns=['patch_ID', 'antigen', 'central_AA'])

table_0.to_csv(f'{dir_data}/table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv')
track_resids_patchs.to_csv(f'{dir_data}/track_resids_patchs_table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv')


# **Table #1 (PatchFramesValues) (RSASA)**

This section computes the table table_1*. The table table_1* contains the solvent-accessibility data of each patch in each frame considered; that is, patches are processed frame by frame.
For each patch, a set of features is calculated for the central AA of the patch and for the AAs belonging to the patch that are solvent-accessible.

In [4]:

def compute_table_1(index, col):
    """Compute the per-frame RSASA features of one patch.

    Parameters
    ----------
    index : label of the row in ``track_resids_patchs`` (unused, kept for the caller).
    col : row of ``track_resids_patchs`` providing 'antigen', 'central_AA' and 'patch_ID'.

    Returns
    -------
    pandas.DataFrame with one row per processed frame. Columns: 'patch_ID',
    'frame_ID' (running counter of processed frames), 'RSASA_central_AA', and for
    every other residue seen in the patch in any frame, 'RSASA_resid_<n>' and
    'presence_resid_<n>' (1 if the residue belongs to the patch in that frame).
    Empty when no frame is processed.

    Raises
    ------
    FileNotFoundError if the antigen has no SASA output file.
    """
    import glob  # local import: only needed to count the SASA output files

    max_sasa = pd.read_csv(f'{base_dir}/../data/Max_SASA_per_residue.tsv', header=None, sep='\t')
    antigen = col['antigen']
    dir_antigen = f'{dir_data}/{antigen}'

    with open(f'{dir_antigen}/PDBs/seq_frame_0.txt') as file1:
        seq_fasta = [line.strip() for line in file1]

    # Lines 1 and 3 of the FASTA file hold the sequences of the two chains.
    seq_antigen = seq_fasta[1] + seq_fasta[3]
    central_AA = col['central_AA']
    patch_ID = col['patch_ID']
    res_name_central = seq_antigen[central_AA-1]
    list_patch_all = set()
    dic_frame_patch = {}

    # Each line reads "<resid> <resid> ...:<path ending in _<frame>.pdb>".
    with open(f'{dir_antigen}/patchs/patches_vmd_resid_{central_AA}_size_{radius_patch}.txt', 'r') \
            as file1:
        for line in file1:
            fields = line.strip().split(':')
            list_patch_all = list_patch_all.union({int(i.strip()) for i in fields[0].split(' ')})
            frame_number = int(fields[1].split('_')[-1].split('.pdb')[0])
            dic_frame_patch[frame_number] = [int(i.strip()) for i in fields[0].split(' ')]

    # The central residue gets its own column, so drop it from the neighbour set.
    list_patch_all.remove(central_AA)

    # Count the SASA files with glob instead of spawning a shell `ls` per patch.
    num_frames = len(glob.glob(f'{dir_antigen}/SASAs_out/SASA*txt'))
    if num_frames == 0:
        raise FileNotFoundError(f'No SASA output file found in {dir_antigen}/SASAs_out')

    max_sasa_cache = {}

    def max_sasa_of(res_name):
        """Return column 3 of the reference row for ``res_name`` (looked up once per residue type)."""
        if res_name not in max_sasa_cache:
            max_sasa_cache[res_name] = max_sasa.loc[max_sasa[0] == res_name].iloc[0, 3]
        return max_sasa_cache[res_name]

    # Collect one Series per frame and concatenate once at the end.
    frame_columns = []
    for frame_number in sorted(dic_frame_patch.keys()):
        if frame_number >= num_frames:
            continue
        frame_count = len(frame_columns)
        col_table_1 = pd.Series(name=frame_count, dtype='object')
        col_table_1['patch_ID'] = patch_ID
        col_table_1['frame_ID'] = frame_count
        SASA_file = pd.read_csv(f'{dir_antigen}/SASAs_out/SASA_{frame_number}.txt',
                                usecols=[0, 1],
                                names=[0, 1],
                                header=None,
                                sep='\t',
                                low_memory=False,
                                dtype={
                                    0: 'int32',
                                    1: 'float64'
                                })

        # Column 0 of the SASA file is matched against resid-1 (presumably zero-based residue indices).
        sasa = SASA_file.loc[SASA_file[0] == central_AA-1, 1].iloc[0]
        col_table_1['RSASA_central_AA'] = (sasa / max_sasa_of(res_name_central)) * 100

        for resid in list_patch_all:
            res_name = seq_antigen[resid-1]
            sasa = SASA_file.loc[SASA_file[0] == resid-1, 1].iloc[0]
            rsasa = (sasa / max_sasa_of(res_name)) * 100
            col_table_1[f'RSASA_resid_{resid}'] = rsasa
            col_table_1[f'presence_resid_{resid}'] = 1 if resid in \
                                                    dic_frame_patch[frame_number] else 0

        frame_columns.append(col_table_1)

    if not frame_columns:
        return pd.DataFrame(dtype='object').T
    return pd.concat(frame_columns, axis=1).T

# Leave two cores free, but never go below one worker: on machines with <= 2 cores
# cpu_count()-2 would be 0 (rejected by joblib) or -1 (which means "use all cores").
jobs = max(1, mp.cpu_count() - 2)
# One (index, row) task per patch.
list_param = list(track_resids_patchs.iterrows())

outs_table_1 = Parallel(n_jobs=jobs)(delayed(compute_table_1)(index, col) for index, col in \
                                     list_param)


"\ntrack_frames_patchs = pd.DataFrame(columns=['frame_ID', 'patch_ID', 'frame_number'])\ntrack_frames_patchs[['frame_ID', 'patch_ID', 'frame_number']] = list(zip(range(len(table_1['frame_ID'])),                                                                 table_1['patch_ID'],                                                                 table_1['frame_ID']))\n\nfor col in track_frames_patchs.columns:\n    track_frames_patchs[col] = list(map(int, track_frames_patchs[col]))\n\ntrack_frames_patchs.to_csv(f'{base_dir}/prediction_epitopes/track_frames_patchs_table_1_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv')\n"

In [5]:
def chunks(lst, n):
    """Generate consecutive slices of ``lst`` holding ``n`` items each (the last one may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Lazy stream of 5000-patch slices of the Table 1 results (single use).
lst_chunks = chunks(outs_table_1, 5000)

print()




In [6]:
# Write each chunk of per-patch frames to its own HDF file, numbered from 0.
count = 0
for chunk_frames in lst_chunks:
    chunk_table = pd.concat(chunk_frames, axis=0, ignore_index=True)
    # The context manager closes the store even if the append fails.
    with pd.HDFStore(f"{dir_data}/table_1_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}_{count}.hdf") as f_table_1:
        f_table_1.append(f"table_1_{count}", chunk_table)
    del chunk_table
    count += 1
# `count` already equals the number of files written (indices 0..count-1);
# count+1 would make the reader look for one file that does not exist.
nb_files = count
del lst_chunks, outs_table_1

In [8]:
# Lazy, single-use generator: chunk file i (key 'table_1_<i>') is only read from disk
# when the generator reaches index i, for i in range(nb_files).
list_df = (pd.read_hdf(f'{dir_data}/table_1_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}_{i}.hdf', f'table_1_{i}') for i in range(nb_files))

# **Table #2 (PatchAggValuesPerResidue)**

The tables table_0* and table_1* are combined per patch, and the per-frame values are aggregated over the frames (median, minimum and maximum RSASA, and the presence frequency of each residue).

In [None]:
def _antigen_group(antigen):
    """Return the group key of an antigen name: 'A', 'B', 'C', or the first two letters of its prefix."""
    prefix = antigen.split('_')[0]
    return prefix if prefix in ['A', 'B', 'C'] else prefix[:2]


def _chain_lengths(groupe):
    """Return (len_chain_A, len_chain_B) for a group; raise ValueError on an unknown group."""
    if groupe in ['A', 'B', 'C']:
        return dic_locus_lens['A'], dic_locus_lens['B2']
    if groupe in ['DP', 'DQ', 'DR']:
        return dic_locus_lens[f'{groupe}A'], dic_locus_lens[f'{groupe}B']
    # Previously an unknown group silently reused the lengths of the preceding antigen.
    raise ValueError(f'Unknown HLA group {groupe!r}')


def _load_rmsf_and_rsasa(antigen):
    """Load the RMSF and median surface-accessibility tables of an antigen.

    Returns (df_RMSF, df_RSASA, len_chain_A, len_chain_B); df_RSASA is indexed by
    1-based residue number and has a single column named 'mean'.
    """
    len_chain_A, len_chain_B = _chain_lengths(_antigen_group(antigen))
    dir_antigen = f'{dir_data}/{antigen}'
    df_RMSF = pd.read_csv(f'{dir_antigen}/{antigen}_RMSF.dat', index_col=0, names=['RMSF'],
                          sep='\t')
    df_RSASA = pd.read_csv(f'{dir_antigen}/{surf_meth}_median.txt', header=0, sep='\t', usecols=[1],
                           names=['mean'])
    df_RSASA.loc[:, 'index'] = range(1, len_chain_A+len_chain_B+1)
    df_RSASA.set_index('index', inplace=True)
    return df_RMSF, df_RSASA, len_chain_A, len_chain_B


dic_N_RMSF_all = {}
data = []

# Pass 1: gather the RMSF of surface residues of every antigen to fit the scaler.
# Residues close to either end of each chain are left out of the fit.
for antigen in list_avail_alleles:
    df_RMSF, df_RSASA, len_chain_A, len_chain_B = _load_rmsf_and_rsasa(antigen)

    ix = [value for value in df_RMSF.index
          if (value > 5 and value < len_chain_A-4)
          or (value > len_chain_A+5 and value < len_chain_A+len_chain_B-4)]

    df2 = df_RMSF.loc[ix]
    df_RSASA2 = df_RSASA.loc[ix, :]
    data.extend(df2.loc[df_RSASA2['mean'] >= thresh_surf].values)

std_scaler = StandardScaler()
std_scaler.fit(data)

# Pass 2: standardise the RMSF of all surface residues (chain ends included) of each antigen.
for antigen in list_avail_alleles:
    df_RMSF, df_RSASA, len_chain_A, len_chain_B = _load_rmsf_and_rsasa(antigen)
    df2 = df_RMSF.loc[df_RSASA['mean'] >= thresh_surf]
    data_std = std_scaler.transform(df2.values.reshape(-1, 1)).reshape(1, -1)[0]
    df_std = pd.DataFrame(data_std, columns=['N-RMSF'], index=df2.index)
    dic_N_RMSF_all[antigen] = df_std

del std_scaler

def compute_table_2(index, col):
    """Aggregate the per-frame values of one patch over all of its frames.

    Reads the module-level ``table_1`` (frames of the current chunk), ``table_0``,
    ``track_resids_patchs`` and ``dic_N_RMSF_all``.

    Parameters
    ----------
    index : label of the row in ``track_resids_patchs`` (unused, kept for the caller).
    col : row of ``track_resids_patchs`` providing 'patch_ID', 'central_AA' and 'antigen'.

    Returns
    -------
    pandas.Series named after the patch. It holds, for the central residue,
    N_RMSF_central, RSASA_{avg,min,max}_central (the "avg" entries are medians),
    charge_+_central, charge_-_central and hydro_central; and, for every other patch
    residue that is surface-exposed, the matching *_resid_<n> entries plus
    freq_resid_<n>, the fraction of frames in which the residue belongs to the patch.

    Raises
    ------
    ValueError if the antigen's group is not A, B, C, DP, DQ or DR.
    """
    patch_ID = col['patch_ID']
    central_AA = col['central_AA']
    antigen = col['antigen']
    prefix = antigen.split('_')[0]
    groupe = prefix if prefix in ['A', 'B', 'C'] else prefix[:2]

    if groupe in ['A', 'B', 'C']:
        len_chain_A, len_chain_B = (dic_locus_lens['A'], dic_locus_lens['B2'])
    elif groupe in ['DP', 'DQ', 'DR']:
        len_chain_A, len_chain_B = (dic_locus_lens[f'{groupe}A'], dic_locus_lens[f'{groupe}B'])
    else:
        raise ValueError(f'Unknown HLA group {groupe!r} for antigen {antigen!r}')

    dir_antigen = f'{dir_data}/{antigen}'

    # Filter the shared tables once instead of repeating the same boolean mask for every value.
    n_rmsf_antigen = dic_N_RMSF_all[antigen]
    frames_patch = table_1.loc[table_1['patch_ID'] == patch_ID]
    central_row = table_0.loc[table_0['patch_ID'] == patch_ID]
    tracks_antigen = track_resids_patchs.loc[track_resids_patchs['antigen'] == antigen]

    col_table_2 = pd.Series(name=patch_ID, dtype='object')
    col_table_2['patch_ID'] = int(patch_ID)
    col_table_2['N_RMSF_central'] = n_rmsf_antigen.at[central_AA, 'N-RMSF']
    rsasa_central = frames_patch['RSASA_central_AA']
    col_table_2['RSASA_avg_central'] = rsasa_central.median()
    col_table_2['RSASA_min_central'] = rsasa_central.min()
    col_table_2['RSASA_max_central'] = rsasa_central.max()
    col_table_2['charge_+_central'] = central_row['charge_+'].iloc[0]
    col_table_2['charge_-_central'] = central_row['charge_-'].iloc[0]
    col_table_2['hydro_central'] = central_row['hydro'].iloc[0]

    # Residues of this patch = the non-NaN '*resid_<n>' columns of its first frame.
    list_col_names = frames_patch.loc[frames_patch['frame_ID'] == 0].T.dropna().index
    col_resids = {int(name.split('_')[-1]) for name in list_col_names if 'resid' in name}
    df_RSASA = pd.read_csv(f'{dir_antigen}/{surf_meth}_median.txt', header=0, sep='\t',
                           usecols=[1], names=['mean'])
    df_RSASA.loc[:, 'index'] = range(1, len_chain_A+len_chain_B+1)
    df_RSASA.set_index('index', inplace=True)
    surf_resids = df_RSASA.loc[df_RSASA['mean'] >= thresh_surf].index
    # Keep only the patch residues that are surface-exposed.
    list_resid = col_resids.intersection(set(surf_resids))

    for resid in list_resid:
        col_table_2[f'N_RMSF_resid_{resid}'] = n_rmsf_antigen.at[resid, 'N-RMSF']
        # Charge and hydrophobicity of a neighbour come from the patch centred on that residue.
        patch_resid = tracks_antigen.loc[tracks_antigen['central_AA'] == resid, 'patch_ID'].iloc[0]
        rsasa_resid = frames_patch[f'RSASA_resid_{resid}']
        col_table_2[f'RSASA_avg_resid_{resid}'] = rsasa_resid.median()
        col_table_2[f'RSASA_min_resid_{resid}'] = rsasa_resid.min()
        col_table_2[f'RSASA_max_resid_{resid}'] = rsasa_resid.max()
        resid_row = table_0.loc[table_0['patch_ID'] == patch_resid]
        col_table_2[f'charge_+_resid_{resid}'] = resid_row['charge_+'].iloc[0]
        col_table_2[f'charge_-_resid_{resid}'] = resid_row['charge_-'].iloc[0]
        col_table_2[f'hydro_resid_{resid}'] = resid_row['hydro'].iloc[0]
        presence = frames_patch[f'presence_resid_{resid}']
        col_table_2[f'freq_resid_{resid}'] = presence.sum() / presence.shape[0]
    return col_table_2

track_resids_patchs = pd.read_csv(f'{dir_data}/track_resids_patchs_table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', sep=',', index_col=0)
table_0 = pd.read_csv(f'{dir_data}/table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', sep=',', index_col=0)
# Leave two cores free, but never go below one worker (0 is rejected by joblib, -1 means "all cores").
jobs = max(1, mp.cpu_count() - 2)
# One single-row frame per patch, concatenated once after the loop
# instead of growing table_2 one row at a time.
rows_table_2 = []

for df in list_df:
    # compute_table_2 reads the current chunk through the module-level name `table_1`.
    table_1 = df
    df_chunk = track_resids_patchs.loc[track_resids_patchs['patch_ID'].isin(\
                                                            df['patch_ID'].unique())]
    list_param = list(df_chunk.iterrows())

    outs_table_2 = Parallel(n_jobs=jobs)(delayed(compute_table_2)(index, col) for index, col in \
                                         list_param)
    rows_table_2.extend(pd.DataFrame(obj).T for obj in outs_table_2)

    del table_1, df

table_2 = pd.concat(rows_table_2, axis=0, ignore_index=True) if rows_table_2 else pd.DataFrame()

del list_df, rows_table_2

In [None]:
# Persist table_2; the file name encodes the surface method, threshold, hydrophobicity scale and patch radius.
table_2.to_csv(f'{dir_data}/table_2_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv')


In [None]:
# Reload the patch tracking table and table_2 from disk, so the cells below
# can be run without recomputing the tables above.
track_resids_patchs = pd.read_csv(f'{dir_data}/track_resids_patchs_table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', sep=',', index_col=0)
table_2 = pd.read_csv(f'{dir_data}/table_2_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', sep=',', index_col=0)


In [None]:
# Diagnostic: report every patch residue whose median RSASA over frames is <= 20.
# NOTE(review): 20 matches the current value of thresh_surf but is hard-coded here.
list_tuples = []
rsasa_avg_columns = [name for name in table_2.columns if 'RSASA_avg_resid' in name]
for col_name in rsasa_avg_columns:
    for i in table_2[col_name].dropna().index:
        value = table_2.at[i, col_name]
        if value <= 20:
            patch_id = int(table_2.at[i, 'patch_ID'])
            patch_rows = track_resids_patchs[track_resids_patchs['patch_ID'] == patch_id]
            antigen = patch_rows['antigen'].iloc[0]
            central_AA = patch_rows['central_AA'].iloc[0]
            record = (patch_id, col_name, value, antigen, central_AA)
            print(*record)
            list_tuples.append(record)

# Antigens that have at least one reported residue.
var = {record[3] for record in list_tuples}

# **Table #3 (PatchAggValues)**

Aggregation is performed on table_2* with respect to the AAs in the patches.

In [None]:
df_all_eplets = pd.read_csv(f'{base_dir}/../data/lists_All_confirmed_eplets.csv', index_col=0)
df_all_no_conf_eplets = pd.read_csv(f'{base_dir}/../data/lists_All_non_confirmed_eplets.csv', index_col=0)
df_eplets_ext = pd.DataFrame(columns=['list_resids'])
df_no_conf_eplets_ext = pd.DataFrame(columns=['list_resids'])
track_resids_patchs = pd.read_csv(f'{dir_data}/track_resids_patchs_table_0_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', sep=',', index_col=0)


def _store_eplet_resids(df_source, df_target, allele_name):
    """Store in ``df_target.loc[allele_name, 'list_resids']`` the set of residue numbers of all eplets of the allele.

    ``df_source`` needs the columns 'allele_name' and 'eplet_resid_nums' (space-separated
    residue numbers). Empty tokens produced by repeated or trailing spaces are ignored.
    An allele without eplets gets an empty set.
    """
    eplets = df_source[df_source['allele_name'] == allele_name]
    df_target.loc[allele_name, 'list_resids'] = set()
    for resid_nums in eplets['eplet_resid_nums']:
        resids = set(map(int, filter(None, resid_nums.split(' '))))
        df_target.loc[allele_name, 'list_resids'] = \
            df_target.loc[allele_name, 'list_resids'].union(resids)


for allele in list_avail_alleles:
    two_chain_names = allele.split('_')[0] in ['DP', 'DQ']
    allele = allele.strip()
    if two_chain_names:
        # e.g. 'DP_0103-0401' -> 'DPA1*01:03' and 'DPB1*04:01'
        name_A = allele.split('_')[0]+'A1*'+allele.split('-')[0][3:5]+':'+allele.split('-')[0][5:7]
        name_B = allele.split('_')[0]+'B1*'+allele.split('-')[1][0:2]+':'+allele.split('-')[1][2:4]
        chain_names = [name_A, name_B]
    else:
        # e.g. 'A_0101' -> 'A*01:01'
        name_A = allele.split('_')[0]+'*'+allele.split('_')[1][0:2]+':'+allele.split('_')[1][2:4]
        chain_names = [name_A]

    for chain_name in chain_names:
        _store_eplet_resids(df_all_eplets, df_eplets_ext, chain_name)
    for chain_name in chain_names:
        _store_eplet_resids(df_all_no_conf_eplets, df_no_conf_eplets_ext, chain_name)

def compute_table_3(index, col):
    """Aggregate one row of table_2 over the residues of the patch and attach the class label.

    Reads the module-level ``track_resids_patchs``, ``df_eplets_ext`` and ``dic_locus_lens``.

    Parameters
    ----------
    index : label of the row in ``table_2`` (unused, kept for the caller).
    col : row of ``table_2`` with NaN entries dropped.

    Returns
    -------
    pandas.Series named after the patch, holding the central-residue values, the
    min / max / mean of N-RMSF, RSASA and hydrophobicity over the patch, the summed
    charges, the presence-frequency-weighted variants ('*_freq'), and 'class'
    (1 if the central residue belongs to a confirmed eplet of the allele, else 0).
    In every aggregate the central residue enters unweighted, and the '*_avg_freq'
    means divide by the number of values, not by the sum of frequencies.

    Raises
    ------
    ValueError if the allele prefix is none of DP, DQ, A, B, C or DR*.
    """
    patch_ID = col['patch_ID']
    resids_patch = {int(name.split('_')[-1]) for name in col.index if 'N_RMSF_resid' in name}

    def per_resid(feature, weighted=False):
        """List the '<feature>_resid_<n>' values of the patch, optionally times 'freq_resid_<n>'."""
        if weighted:
            return [col[f'{feature}_resid_{resid}'] * col[f'freq_resid_{resid}']
                    for resid in resids_patch]
        return [col[f'{feature}_resid_{resid}'] for resid in resids_patch]

    col_table_3 = pd.Series(name=patch_ID, dtype='object')
    col_table_3['patch_ID'] = patch_ID

    n_rmsf_central = col['N_RMSF_central']
    col_table_3['N_RMSF_central'] = n_rmsf_central
    col_table_3['N_RMSF_patch_min'] = min(per_resid('N_RMSF') + [n_rmsf_central])
    col_table_3['N_RMSF_patch_max'] = max(per_resid('N_RMSF') + [n_rmsf_central])
    col_table_3['N_RMSF_patch_avg'] = np.average(per_resid('N_RMSF') + [n_rmsf_central])
    col_table_3['N_RMSF_patch_avg_freq'] = np.average(per_resid('N_RMSF', weighted=True)
                                                      + [n_rmsf_central])

    rsasa_min_central = col['RSASA_min_central']
    rsasa_max_central = col['RSASA_max_central']
    rsasa_avg_central = col['RSASA_avg_central']
    charge_pos_central = col['charge_+_central']
    charge_neg_central = col['charge_-_central']
    hydro_central = col['hydro_central']
    col_table_3['RSASA_min_central'] = rsasa_min_central
    col_table_3['RSASA_max_central'] = rsasa_max_central
    col_table_3['RSASA_avg_central'] = rsasa_avg_central
    col_table_3['charge_pos_central'] = charge_pos_central
    col_table_3['charge_neg_central'] = charge_neg_central
    col_table_3['hydro_central'] = hydro_central

    col_table_3['charge_patch_pos_freq'] = sum(per_resid('charge_+', weighted=True)
                                               + [charge_pos_central])
    # NOTE(review): the trailing space in this key is kept on purpose - it ends up in the
    # CSV header, and downstream readers may depend on it. Confirm before renaming.
    col_table_3['charge_patch_neg_freq '] = sum(per_resid('charge_-', weighted=True)
                                                + [charge_neg_central])
    col_table_3['charge_patch_pos'] = sum(per_resid('charge_+') + [charge_pos_central])
    col_table_3['charge_patch_neg'] = sum(per_resid('charge_-') + [charge_neg_central])

    col_table_3['RSASA_patch_min_avg_freq'] = np.average(per_resid('RSASA_min', weighted=True)
                                                         + [rsasa_min_central])
    col_table_3['RSASA_patch_max_avg_freq'] = np.average(per_resid('RSASA_max', weighted=True)
                                                         + [rsasa_max_central])
    col_table_3['RSASA_patch_avg_avg_freq'] = np.average(per_resid('RSASA_avg', weighted=True)
                                                         + [rsasa_avg_central])
    col_table_3['RSASA_patch_min_avg'] = np.average(per_resid('RSASA_min') + [rsasa_min_central])
    col_table_3['RSASA_patch_max_avg'] = np.average(per_resid('RSASA_max') + [rsasa_max_central])
    col_table_3['RSASA_patch_avg_avg'] = np.average(per_resid('RSASA_avg') + [rsasa_avg_central])

    col_table_3['hydro_patch_min'] = min(per_resid('hydro') + [hydro_central])
    col_table_3['hydro_patch_max'] = max(per_resid('hydro') + [hydro_central])
    col_table_3['hydro_patch_avg'] = np.average(per_resid('hydro') + [hydro_central])
    col_table_3['hydro_patch_avg_freq'] = np.average(per_resid('hydro', weighted=True)
                                                     + [hydro_central])

    patch_rows = track_resids_patchs[track_resids_patchs['patch_ID'] == patch_ID]
    allele = patch_rows['antigen'].iloc[0].strip()
    central_AA = patch_rows['central_AA'].iloc[0]
    prefix = allele.split('_')[0]

    if prefix in ['DP', 'DQ']:
        name_A = prefix+'A1*'+allele.split('-')[0][3:5]+':'+allele.split('-')[0][5:7]
        name_B = prefix+'B1*'+allele.split('-')[1][0:2]+':'+allele.split('-')[1][2:4]
        eplets_A_ext = df_eplets_ext.loc[name_A, 'list_resids']
        eplets_B_ext = df_eplets_ext.loc[name_B, 'list_resids']
        # Chain B residues follow chain A in the concatenated numbering.
        offset = dic_locus_lens[prefix+'A']
        eplets_allele = eplets_A_ext.union({resid + offset for resid in eplets_B_ext})
    elif prefix in ['A', 'B', 'C']:
        name_A = prefix+'*'+allele.split('_')[1][0:2]+':'+allele.split('_')[1][2:4]
        eplets_allele = df_eplets_ext.loc[name_A, 'list_resids']
    elif prefix[:2] == 'DR':
        # Equality, not `in 'DR'`: the substring test also accepted '' and 'D'.
        name_B = prefix+'*'+allele.split('_')[1][0:2]+':'+allele.split('_')[1][2:4]
        eplets_B_ext = df_eplets_ext.loc[name_B, 'list_resids']
        # Only the B chain is looked up for DR; shift it past the DRA chain.
        eplets_allele = {resid + dic_locus_lens['DRA'] for resid in eplets_B_ext}
    else:
        raise ValueError(f'Unknown allele prefix {prefix!r} for antigen {allele!r}')

    col_table_3['class'] = 1 if central_AA in eplets_allele else 0

    return col_table_3
    
table_2 = pd.read_csv(f'{dir_data}/table_2_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv', \
                      sep=',', index_col=0)
# Leave two cores free, but never go below one worker (0 is rejected by joblib, -1 means "all cores").
jobs = max(1, mp.cpu_count() - 2)

# One task per patch; NaN entries (residues absent from the patch) are dropped from each row.
list_param = [(index, col.dropna()) for index, col in table_2.iterrows()]

outs_table_3 = Parallel(n_jobs=jobs)(delayed(compute_table_3)(index, col) for index, col in \
                                         list_param)


In [None]:
# Turn each per-patch Series into a one-row frame and concatenate once,
# instead of growing table_3 one row at a time.
rows_table_3 = [pd.DataFrame(obj).T for obj in outs_table_3]
table_3 = pd.concat(rows_table_3, axis=0, ignore_index=True) if rows_table_3 else pd.DataFrame()
del rows_table_3

# Identifiers and labels are stored as integers.
table_3['patch_ID'] = list(map(int, table_3['patch_ID']))
table_3['class'] = list(map(int, table_3['class']))
table_3.to_csv(f'{dir_data}/table_3_{surf_meth}_{thresh_surf}_hydro_{hydro_scale}_radius_{radius_patch}.csv')
