# Catalog Cleaning
This notebook performs feature engineering on the proteome catalog (i.e. './ProteomaGS/Cat_ProteomaGS.csv'). Specifically, it adds three indicator columns: Secretome and Cyst (1 when the Gene ID appears in the Secretome or Cyst files, respectively, 0 otherwise) and Trophozoite (1 when Cyst is 0, 0 otherwise).

In [1]:
import pandas as pd
import os 
import glob

In [None]:
# Load the proteome catalog CSV into a new dataframe.
# The path targets the ProteomaGS catalog: the previous value
# './ProteomaWB/Cat_ProteomaGS.csv' combined the WB directory with the GS file
# name, which disagrees with the catalog named in the notebook introduction and
# with the './ProteomaGS/' directory the result is written to.
# NOTE(review): confirm this is the intended input file on disk.
cat_path = './ProteomaGS/Cat_ProteomaGS.csv'

# low_memory=False parses the file in one pass so that column dtypes are
# inferred consistently instead of chunk by chunk.
cat_df = pd.read_csv(cat_path, low_memory=False)

# Sanity check: report the dimensions and preview the first rows.
print("cat_df shape:", cat_df.shape)
display(cat_df.head())

cat_df shape: (5391, 9)


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions
0,GL50803_0010013,GL50803_0010013_t1,Giardia Assemblage A isolate WB 2019,Qc-SNARE 3,220.0,24462.0,,1.0,
1,GL50803_0010014,GL50803_0010014_t1,Giardia Assemblage A isolate WB 2019,unspecified product,378.0,43479.0,,0.0,
2,GL50803_0010016,GL50803_0010016_t1,Giardia Assemblage A isolate WB 2019,Phasin superfamily protein,311.0,34514.0,Thioredoxin-like superfamily,0.0,
3,GL50803_0010019,GL50803_0010019_t1,Giardia Assemblage A isolate WB 2019,putative Phospholipid-transporting ATPase IA,1560.0,173785.0,"HAD-like superfamily;P-type ATPase, A domain s...",8.0,ATP binding;ATPase-coupled intramembrane lipid...
4,GL50803_0010025,GL50803_0010025_t1,Giardia Assemblage A isolate WB 2019,unspecified product,241.0,27232.0,,0.0,


### Add Secretome column

In [None]:
# Directory holding the secretome Excel workbooks.
# NOTE(review): this is an absolute, machine-specific path, whereas the catalog
# is addressed relative to the working directory -- consider making this
# relative (or configurable) as well.
secretome_dir = '/home/carlos/LIBCE/ProteomaGS/Secretome'
secretome_pattern = os.path.join(secretome_dir, '*.xls*')  # matches .xls and .xlsx

# glob returns matches in a file-system-dependent order; sorting makes the
# concatenated row order (and the preview below) reproducible across runs.
secretome_files = sorted(glob.glob(secretome_pattern))

# Fail early with a clear message when the directory has no workbooks.
if not secretome_files:
    raise FileNotFoundError(f"No Excel files found in {secretome_dir!r}")

# Read all Excel files (first sheet, headers in first row) and concatenate
# them row-wise into one dataframe with a fresh 0..n-1 index.
secretome_dfs = [pd.read_excel(f, sheet_name=0) for f in secretome_files]
secretome_df = pd.concat(secretome_dfs, axis=0, ignore_index=True)

print(f"Read {len(secretome_files)} files -> concatenated shape: {secretome_df.shape}")
display(secretome_df.head())

Read 4 files -> concatenated shape: (195, 10)


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Molecular Weight,# TM Domains,Protein Length,Computed GO Functions,Computed GO Processes,Superfamily Description
0,GL50803_0010423,GL50803_0010423_t1,Giardia Assemblage A isolate WB 2019,Activator of Hsp90 ATPase,16827,0,147,ATPase activator activity;Hsp90 protein bindin...,,"Activator of Hsp90 ATPase, Aha1"
1,GL50803_0010623,GL50803_0010623_t1,Giardia Assemblage A isolate WB 2019,Phosphoenolpyruvate carboxykinase,73845,0,654,GTP binding;phosphoenolpyruvate carboxykinase ...,gluconeogenesis,"Phosphoenolpyruvate carboxykinase, N-terminal"
2,GL50803_00113038,GL50803_00113038_t1,Giardia Assemblage A isolate WB 2019,Tenascin-like protein,60841,0,574,calcium ion binding,,
3,GL50803_00113304,GL50803_00113304_t1,Giardia Assemblage A isolate WB 2019,VSP,64673,1,636,,,Growth factor receptor cysteine-rich domain su...
4,GL50803_00113416,GL50803_00113416_t1,Giardia Assemblage A isolate WB 2019,High cysteine membrane protein TMK-like,261974,1,2516,,,Growth factor receptor cysteine-rich domain su...


In [None]:
# Collect each distinct Gene ID from the secretome table, in order of first
# appearance, as a plain Python list.
secretome_gene_ids = secretome_df['Gene ID'].drop_duplicates().tolist()

n_secretome_ids = len(secretome_gene_ids)
print(f"Number of unique Gene IDs: {n_secretome_ids}")

# Preview the first ten identifiers.
print(secretome_gene_ids[:10])

Number of unique Gene IDs: 195
['GL50803_0010423', 'GL50803_0010623', 'GL50803_00113038', 'GL50803_00113304', 'GL50803_00113416', 'GL50803_00113553', 'GL50803_00113677', 'GL50803_00114246', 'GL50803_0011470', 'GL50803_00115202']


In [7]:
# Flag catalog rows whose Gene ID appears in the secretome list:
# Secretome is 1 for members and 0 for everything else.
in_secretome = cat_df['Gene ID'].isin(secretome_gene_ids)
cat_df['Secretome'] = in_secretome.astype(int)

# Quick check: how many rows fall in each class, plus a preview.
secretome_counts = cat_df['Secretome'].value_counts()
print("Secretome counts:\n", secretome_counts)
display(cat_df.head())

Secretome counts:
 Secretome
0    5196
1     195
Name: count, dtype: int64


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions,Secretome
0,GL50803_0010013,GL50803_0010013_t1,Giardia Assemblage A isolate WB 2019,Qc-SNARE 3,220.0,24462.0,,1.0,,0
1,GL50803_0010014,GL50803_0010014_t1,Giardia Assemblage A isolate WB 2019,unspecified product,378.0,43479.0,,0.0,,0
2,GL50803_0010016,GL50803_0010016_t1,Giardia Assemblage A isolate WB 2019,Phasin superfamily protein,311.0,34514.0,Thioredoxin-like superfamily,0.0,,0
3,GL50803_0010019,GL50803_0010019_t1,Giardia Assemblage A isolate WB 2019,putative Phospholipid-transporting ATPase IA,1560.0,173785.0,"HAD-like superfamily;P-type ATPase, A domain s...",8.0,ATP binding;ATPase-coupled intramembrane lipid...,0
4,GL50803_0010025,GL50803_0010025_t1,Giardia Assemblage A isolate WB 2019,unspecified product,241.0,27232.0,,0.0,,0


### Add Cyst column

In [None]:
# Directory holding the cyst Excel workbooks.
# NOTE(review): this is an absolute, machine-specific path -- consider making
# it relative to the working directory (or configurable).
cyst_dir = '/home/carlos/LIBCE/ProteomaGS/Cyst'
cyst_pattern = os.path.join(cyst_dir, '*.xls*')  # matches .xls and .xlsx

# glob returns matches in a file-system-dependent order; sorting makes the
# concatenated row order (and the preview below) reproducible across runs.
cyst_files = sorted(glob.glob(cyst_pattern))

# Fail early with a clear message when the directory has no workbooks.
# The message reports cyst_dir, the directory that was actually searched.
if not cyst_files:
    raise FileNotFoundError(f"No Excel files found in {cyst_dir!r}")

# Read all Excel files (first sheet, headers in first row) and concatenate
# them row-wise into one dataframe with a fresh 0..n-1 index.
cyst_dfs = [pd.read_excel(f, sheet_name=0) for f in cyst_files]
cyst_df = pd.concat(cyst_dfs, axis=0, ignore_index=True)

print(f"Read {len(cyst_files)} files -> concatenated shape: {cyst_df.shape}")
display(cyst_df.head())

Read 1 files -> concatenated shape: (116, 10)


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Molecular Weight,# TM Domains,Protein Length,Computed GO Functions,Computed GO Processes,Superfamily Description
0,GL50803_00102963,GL50803_00102963_t1,Giardia Assemblage A isolate WB 2019,tRNA 2-methylthioadenosine synthase,58322,0,525,"4 iron, 4 sulfur cluster binding;catalytic act...",,
1,GL50803_0010330,GL50803_0010330_t1,Giardia Assemblage A isolate WB 2019,Tenascin-like protein,28090,0,257,,,
2,GL50803_0010358,GL50803_0010358_t1,Giardia Assemblage A isolate WB 2019,A-type flavoprotein,46622,0,414,FMN binding;electron transfer activity;metal i...,,Flavoprotein-like superfamily;Ribonuclease Z/H...
3,GL50803_0010367,GL50803_0010367_t1,Giardia Assemblage A isolate WB 2019,Ribosomal protein S24,14844,0,132,structural constituent of ribosome,translation,Ribosomal protein L23/L15e core domain superfa...
4,GL50803_0010661,GL50803_0010661_t1,Giardia Assemblage A isolate WB 2019,Ubiquitin-conjugating enzyme E1,121422,0,1092,ubiquitin-like modifier activating enzyme acti...,cellular protein modification process,Ubiquitin-activating enzyme


In [9]:
# Collect each distinct Gene ID from the cyst table, in order of first
# appearance, as a plain Python list.
cyst_gene_ids = cyst_df['Gene ID'].drop_duplicates().tolist()

n_cyst_ids = len(cyst_gene_ids)
print(f"Number of unique Gene IDs: {n_cyst_ids}")

Number of unique Gene IDs: 116


In [10]:
# Flag catalog rows whose Gene ID appears in the cyst list:
# Cyst is 1 for members and 0 for everything else.
in_cyst = cat_df['Gene ID'].isin(cyst_gene_ids)
cat_df['Cyst'] = in_cyst.astype(int)

# Quick check: how many rows fall in each class, plus a preview.
cyst_counts = cat_df['Cyst'].value_counts()
print("Cyst counts:\n", cyst_counts)
display(cat_df.head())

Cyst counts:
 Cyst
0    5275
1     116
Name: count, dtype: int64


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions,Secretome,Cyst
0,GL50803_0010013,GL50803_0010013_t1,Giardia Assemblage A isolate WB 2019,Qc-SNARE 3,220.0,24462.0,,1.0,,0,0
1,GL50803_0010014,GL50803_0010014_t1,Giardia Assemblage A isolate WB 2019,unspecified product,378.0,43479.0,,0.0,,0,0
2,GL50803_0010016,GL50803_0010016_t1,Giardia Assemblage A isolate WB 2019,Phasin superfamily protein,311.0,34514.0,Thioredoxin-like superfamily,0.0,,0,0
3,GL50803_0010019,GL50803_0010019_t1,Giardia Assemblage A isolate WB 2019,putative Phospholipid-transporting ATPase IA,1560.0,173785.0,"HAD-like superfamily;P-type ATPase, A domain s...",8.0,ATP binding;ATPase-coupled intramembrane lipid...,0,0
4,GL50803_0010025,GL50803_0010025_t1,Giardia Assemblage A isolate WB 2019,unspecified product,241.0,27232.0,,0.0,,0,0


### Add Trophozoite column

In [11]:
# Trophozoite is defined as the complement of Cyst:
# 1 for rows where Cyst equals 0, and 0 for every other row.
not_cyst = cat_df['Cyst'].eq(0)
cat_df['Trophozoite'] = not_cyst.astype(int)

# Quick check: how many rows fall in each class, plus a preview.
trophozoite_counts = cat_df['Trophozoite'].value_counts()
print("Trophozoite value counts:\n", trophozoite_counts)
display(cat_df.head())

Trophozoite value counts:
 Trophozoite
1    5275
0     116
Name: count, dtype: int64


Unnamed: 0,Gene ID,source_id,Organism,Product Description,Protein Length,Molecular Weight,Superfamily Description,# TM Domains,Computed GO Functions,Secretome,Cyst,Trophozoite
0,GL50803_0010013,GL50803_0010013_t1,Giardia Assemblage A isolate WB 2019,Qc-SNARE 3,220.0,24462.0,,1.0,,0,0,1
1,GL50803_0010014,GL50803_0010014_t1,Giardia Assemblage A isolate WB 2019,unspecified product,378.0,43479.0,,0.0,,0,0,1
2,GL50803_0010016,GL50803_0010016_t1,Giardia Assemblage A isolate WB 2019,Phasin superfamily protein,311.0,34514.0,Thioredoxin-like superfamily,0.0,,0,0,1
3,GL50803_0010019,GL50803_0010019_t1,Giardia Assemblage A isolate WB 2019,putative Phospholipid-transporting ATPase IA,1560.0,173785.0,"HAD-like superfamily;P-type ATPase, A domain s...",8.0,ATP binding;ATPase-coupled intramembrane lipid...,0,0,1
4,GL50803_0010025,GL50803_0010025_t1,Giardia Assemblage A isolate WB 2019,unspecified product,241.0,27232.0,,0.0,,0,0,1


## Save file

In [None]:
# Write the enriched catalog to a new file; index=False keeps the dataframe
# index out of the CSV.
out_path = './ProteomaGS/Cat_ProteomaGS_modified.csv'
cat_df.to_csv(out_path, index=False)

# Report the path that was actually written (previously this printed the
# input path held in cat_path, which did not match the file saved).
print(f"Saved cat_df to: {out_path!r} (shape: {cat_df.shape})")

Saved cat_df to: './ProteomaWB/Cat_ProteomaWB.csv' (shape: (5391, 12))
