## 1. Feature engineering and File cleaning
This notebook merges the files obtained by iedb_runner.py script and joins the resulting dataframe with the catalog LIBCE/ProteomaGS/Cat_ProteomaGS.csv, obtaining the resulting file 

In [1]:
import pandas as pd
import glob
import os

# Path to the directory where long results from the iedb_runner.py script are stored.
directory_path = '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS'
file_pattern = os.path.join(directory_path, '*.csv')
all_files = glob.glob(file_pattern)

all_files[:5]

['../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS/GL50581_3377_RESULTS_LONG.csv',
 '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS/GL50581_3020_RESULTS_LONG.csv',
 '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS/GL50581_3755_RESULTS_LONG.csv',
 '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS/GL50581_3353_RESULTS_LONG.csv',
 '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/results_long_ProteomaGS/GL50581_4392_RESULTS_LONG.csv']

In [2]:
# Read all files into a list of DataFrames
all_dataframes = [pd.read_csv(f) for f in all_files]

# Concatenate all DataFrames
combined_df = pd.concat(all_dataframes, axis=0, ignore_index=True)

In [3]:
combined_df.head()

Unnamed: 0,allele,seq_num,start,end,length,core_peptide,peptide,ic50,rank,adjusted_rank,protein
0,H2-IEd,1,67,81,15,GVLKSQRAR,FPMYQGVLKSQRARL,680.1,1.9,1.9,GL50581_3377
1,H2-IAk,1,57,71,15,DAQGFPMYQ,RITGGNDAQGFPMYQ,3741.2,1.9,1.9,GL50581_3377
2,H2-IAk,1,58,72,15,DAQGFPMYQ,ITGGNDAQGFPMYQG,3979.9,2.3,2.3,GL50581_3377
3,H2-IAk,1,59,73,15,DAQGFPMYQ,TGGNDAQGFPMYQGV,4032.0,2.4,2.4,GL50581_3377
4,H2-IEd,1,88,102,15,CYLHRRNGE,KCYLHRRNGERKRRS,795.6,2.5,2.5,GL50581_3377


## 2. Catalog join
In order to have the file ready for analysis, this section of the notebook will join the combined dataframe to the catalog

In [None]:
# read the catalog CSV generated in catalog_cleaning.ipynb
cat_path = './ProteomaGS/Cat_ProteomaGS_modified.csv'
cat_df = pd.read_csv(cat_path, low_memory=False)  

print("cat_df shape:", cat_df.shape)
display(cat_df.head())

cat_df shape: (4562, 12)


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions,Secretome,Cyst,Trophozoite
0,GL50581_1,GL50581_1-t26_1,Giardia Assemblage B isolate GS,VSP [Source:UniProtKB/TrEMBL;Acc:C6LMQ7],407.0,41559.0,Growth factor receptor cysteine-rich domain su...,1.0,,0,0,1
1,GL50581_100,GL50581_100-t26_1,Giardia Assemblage B isolate GS,Hypothetical protein,236.0,28108.0,,0.0,,0,0,1
2,GL50581_1000,GL50581_1000-t26_1,Giardia Assemblage B isolate GS,CLP1_P domain-containing protein [Source:UniPr...,604.0,66708.0,P-loop containing nucleoside triphosphate hydr...,0.0,ATP binding,0,0,1
3,GL50581_1001,GL50581_1001-t26_1,Giardia Assemblage B isolate GS,Hypothetical protein,832.0,93564.0,,0.0,,0,0,1
4,GL50581_1002,GL50581_1002-t26_1,Giardia Assemblage B isolate GS,"Kinase, NEK [Source:UniProtKB/TrEMBL;Acc:C6LQH5]",678.0,76036.0,Ankyrin repeat-containing domain superfamily;P...,0.0,ATP binding;protein binding;protein kinase act...,0,0,1


In [None]:
# Join combined df with catalog df 
merged_df = combined_df.merge(cat_df, left_on='protein', right_on='Gene ID', how='left', suffixes=('', '_cat'))

print("merged_df shape:", merged_df.shape)
display(merged_df.head())

merged_df shape: (13345600, 23)


Unnamed: 0,allele,seq_num,start,end,length,core_peptide,peptide,ic50,rank,adjusted_rank,...,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions,Secretome,Cyst,Trophozoite
0,H2-IEd,1,67,81,15,GVLKSQRAR,FPMYQGVLKSQRARL,680.1,1.9,1.9,...,Giardia Assemblage B isolate GS,40S ribosomal protein S6 [Source:UniProtKB/TrE...,248.0,27948.0,,0.0,structural constituent of ribosome,0,0,1
1,H2-IAk,1,57,71,15,DAQGFPMYQ,RITGGNDAQGFPMYQ,3741.2,1.9,1.9,...,Giardia Assemblage B isolate GS,40S ribosomal protein S6 [Source:UniProtKB/TrE...,248.0,27948.0,,0.0,structural constituent of ribosome,0,0,1
2,H2-IAk,1,58,72,15,DAQGFPMYQ,ITGGNDAQGFPMYQG,3979.9,2.3,2.3,...,Giardia Assemblage B isolate GS,40S ribosomal protein S6 [Source:UniProtKB/TrE...,248.0,27948.0,,0.0,structural constituent of ribosome,0,0,1
3,H2-IAk,1,59,73,15,DAQGFPMYQ,TGGNDAQGFPMYQGV,4032.0,2.4,2.4,...,Giardia Assemblage B isolate GS,40S ribosomal protein S6 [Source:UniProtKB/TrE...,248.0,27948.0,,0.0,structural constituent of ribosome,0,0,1
4,H2-IEd,1,88,102,15,CYLHRRNGE,KCYLHRRNGERKRRS,795.6,2.5,2.5,...,Giardia Assemblage B isolate GS,40S ribosomal protein S6 [Source:UniProtKB/TrE...,248.0,27948.0,,0.0,structural constituent of ribosome,0,0,1


In [6]:
out_csv = '../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/GL50581_Combined.csv'
merged_df.to_csv(out_csv, index=False)
print(f"merged_df saved to {out_csv}")


merged_df saved to ../LIBCE_RESULTS/RESULTS-ProteomaGS-MOUSE-19-12-2025/GL50581_Combined.csv
