In [27]:
import pandas as pd
from functools import reduce


In [2]:
def load_structural_data(tcr3d_path: str,
                         af_score3_path: str,
                         af_score2_path: str,
                         id10x:list = []):
    map_cols = {
        'TCR_ID': 'id',
        'TRA_ref': 'TRA',
        'TRB_ref': 'TRB',
        'MHCseq_ref': 'MHCseq',
        'assigned_allele': 'mhc_allele',
        'peptide': 'epitope'
    }

    # Load Experimental Data
    tcr3d_data = pd.read_csv(tcr3d_path, index_col=0)
    tcr3d_data.drop(['TRA', 'TRB', 'MHCseq'], axis=1, inplace=True)
    tcr3d_data.rename(columns=map_cols, inplace=True)
    tcr3d_data.reset_index(drop=True, inplace=True)

    # Load AF score 3 Data
    af_score3_data = pd.read_csv(af_score3_path)
    af_score3_data.rename(columns=map_cols, inplace=True)
    af_score3_data.reset_index(drop=True, inplace=True)
    
    # Load AF score 2 Data
    af_score2_data = pd.read_csv(af_score2_path)
    af_score2_data.rename(columns=map_cols, inplace=True)
    af_score2_data.reset_index(drop=True, inplace=True)

    af_data = pd.concat([af_score3_data, af_score2_data], ignore_index=True)
    af_data = af_data[af_data['filepath_a'].notna() & af_data['filepath_b'].notna()]
    af_data.reset_index(drop=True, inplace=True)

    if len(id10x) > 0:
        af_data = af_data[~af_data['Reference'].isin(id10x)]

    print(f'Final AlphaFold training data size:{af_data.shape}')

    # remove overlapping epitopes
    tcr3d_data = tcr3d_data[~tcr3d_data['epitope'].isin(af_data['epitope'])]

    print(f'TCR3D training data size after removing overlapping epitopes: {tcr3d_data.shape}')

    train_data = pd.concat([tcr3d_data, af_data], ignore_index=True)
    train_data = train_data.dropna(subset=['filepath_a', 'filepath_b'], how='any')

    print(f'Total training data size: {train_data.shape}')

    return train_data

In [3]:
tcr3d_path = "/home/samuel.assis/MatchImm/MatchImmNet/data/02-processed/tcr3d_20251004_renamed.csv"

af_score3_path = "/home/samuel.assis/MatchImm/MatchImmNet/data/01-raw/AF_vdjdb_score3_20251212.csv"
af_score2_1217_path = "/home/samuel.assis/MatchImm/MatchImmNet/data/01-raw/AF_vdjdb_score2_wojust10x_20251217.csv"
af_score2_0105_path = "/home/samuel.assis/MatchImm/MatchImmNet/data/01-raw/AF_vdjdb_score2_wojust10x_20260105.csv"


id10x = ['34793243', '30418433', '35383307', '37872153', '32081129','30451992', '34811538']

df1217 = load_structural_data(tcr3d_path, af_score3_path, af_score2_1217_path)
print("-----")
df0105 = load_structural_data(tcr3d_path, af_score3_path, af_score2_0105_path)
print("-----")
df0105_id10x = load_structural_data(tcr3d_path, af_score3_path, af_score2_0105_path, id10x=id10x)

Final AlphaFold training data size:(1473, 24)
TCR3D training data size after removing overlapping epitopes: (209, 18)
Total training data size: (1682, 24)
-----
Final AlphaFold training data size:(2829, 24)
TCR3D training data size after removing overlapping epitopes: (104, 18)
Total training data size: (2933, 24)
-----
Final AlphaFold training data size:(2014, 24)
TCR3D training data size after removing overlapping epitopes: (105, 18)
Total training data size: (2119, 24)


In [39]:
def diff_summary(dataframes, column, names):

    dataframes_counts = [df.value_counts(column).reset_index() for df in dataframes]


    for i, df in enumerate(dataframes_counts):
        df.rename(columns={'count': f"count_{names[i]}"}, inplace=True)

    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['epitope'],
                                            how='outer'), dataframes_counts)
    
    #diff counts
    for i in range(len(dataframes_counts)-1):
        df_merged[f'diff_{names[i]}_{names[i+1]}'] = df_merged[f'count_{names[i]}'] - df_merged[f'count_{names[i+1]}']
    
    df_merged.fillna(0, inplace=True)
    df_merged.sort_values(by=[f'count_{names[0]}'], ascending=False, inplace=True)

    #Number of equal and diff epitopes
    df_merged['equal'] = df_merged[[f'count_{name}' for name in names]].nunique(axis=1) == 1
    df_merged['diff'] = df_merged[[f'diff_{names[i]}_{names[i+1]}' for i in range(len(names)-1)]].sum(axis=1) != 0

    return df_merged
diff_summary([df0105, df0105_id10x, df1217], 'epitope', ['0105', '010510x', '1217'])

Unnamed: 0,epitope,count_0105,count_010510x,count_1217,diff_0105_010510x,diff_010510x_1217,equal,diff
122,GILGFVFTL,795,694.0,780.0,101.0,-86.0,False,True
196,KLGGALQAK,233,233.0,233.0,0.0,0.0,True,False
427,SPRWYFYYL,218,189.0,89.0,29.0,100.0,False,True
329,NLVPMVATV,154,154.0,4.0,0.0,150.0,False,True
466,TTDPSFLGRY,114,114.0,10.0,0.0,104.0,False,True
...,...,...,...,...,...,...,...,...
221,KRWIILGLNK,1,1.0,1.0,0.0,0.0,True,False
220,KQWLVWLLL,1,1.0,1.0,0.0,0.0,True,False
218,KQLPFFYYS,1,0.0,0.0,0.0,0.0,False,False
216,KQIYKTPPI,1,1.0,0.0,0.0,0.0,False,False


In [28]:

ep_count_1217 = df1217.value_counts('epitope').reset_index()
ep_count_0105 = df0105.value_counts('epitope').reset_index()
ep_count_010510x = df0105_id10x.value_counts('epitope').reset_index()

intermed_count = ep_count_0105.merge(ep_count_1217, how='left', on='epitope', suffixes=('_0105', '_1217'))
intermed_count.merge(ep_count_010510x, how='left', on='epitope').rename(columns={'count':'count_0105wo10x'})

Unnamed: 0,epitope,count_0105,count_1217,count_0105wo10x
0,GILGFVFTL,795,780.0,694.0
1,KLGGALQAK,233,233.0,233.0
2,SPRWYFYYL,218,89.0,189.0
3,NLVPMVATV,154,4.0,154.0
4,TTDPSFLGRY,114,10.0,114.0
...,...,...,...,...
553,KRWIILGLNK,1,1.0,1.0
554,KQWLVWLLL,1,1.0,1.0
555,KQLPFFYYS,1,,
556,KQIYKTPPI,1,,1.0


In [None]:
diff_ep = ep_count_0105[~ep_count_0105['epitope'].isin(ep_count_1217['epitope'])].shape[0]
equal_ep = ep_count_0105[ep_count_0105['epitope'].isin(ep_count_1217['epitope'])].shape[0]

print(f'Number of equal epitopes: {equal_ep}')
print(f'Number of different epitopes: {diff_ep}')

Number of equal epitopes: 233
Number of different epitopes: 325


In [None]:
# Difference of peptides counts
ep_count_0105.merge(ep_count_1217, on='epitope', suffixes=('_0105', '_1217')).set_index('epitope').diff(axis=1).abs().dropna(axis=1).reset_index().sort_values('count_1217', ascending=False)

Unnamed: 0,epitope,count_1217
3,NLVPMVATV,150
2,SPRWYFYYL,129
4,TTDPSFLGRY,104
6,KTFPPTEPK,87
5,YLQPRTFLL,42
...,...,...
103,RYPLTFGWCF,0
104,RQWGPDPAAV,0
105,VLGSLAATV,0
106,VFLVLLPLV,0


In [38]:
len(ep_count_1217['epitope'].unique())

233