### Read ligandome of healthy tissues (shared by Cerullo's laboratory)

In [None]:
import numpy as np
import pandas as pd

# Load the healthy-tissue ligandome table; the first CSV column is used as the row index.
ligandome = pd.read_csv(
    filepath_or_buffer="PATH_TO_HEALTHY_TISSUE_LIGANDOME.csv",
    index_col=0,
)
# Report (rows, columns), then preview the first rows as the cell's rich output.
print(ligandome.shape)
ligandome.head()

(18448, 5)


Unnamed: 0,peptide_sequence_id,peptide_sequence,hla_class,tissues,Gene_names
1,1,LLPKKTESHHKAKGK,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","H2AC18, H2AC19, H2AC7, H2AC4, H2AC8, H2AC6, H2..."
2,3,PEPAKSAPAPKKGSKKAVTK,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","H2BC11, H2BC6, H2BC10, H2BC8, H2BC7, H2BC4, H2..."
3,6,RKSNAAERRGPL,HLA-II,"Adrenal gland,Bone marrow,Cerebellum,Heart,Kid...",HLA-DRA
4,10,AKGKKVAPAPAVVK,HLA-II,"Adrenal gland,Colon,Lung,Ovary,Small intestine...",RPL7A
5,16,FTRILKHGAKDKDD,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","MYL12B, MYL12A, MYL9"


### Filter to HLA-II ligands found in healthy ovary

In [2]:
# Keep ligands whose tissue list mentions ovary.
# `regex=False` makes this a literal substring match, and `na=False` treats rows
# with a missing tissue annotation as non-matching instead of producing NaN in
# the boolean mask (which would make the indexing fail).
ovary_ligandome = ligandome[
    ligandome['tissues'].str.contains('Ovary', regex=False, na=False)
]
print(ovary_ligandome.shape)

# Narrow further to HLA class II ligands, using the same literal, NaN-safe matching.
hlaii_ovary_ligandome = ovary_ligandome[
    ovary_ligandome['hla_class'].str.contains('HLA-II', regex=False, na=False)
]
print(hlaii_ovary_ligandome.shape)
hlaii_ovary_ligandome.head()

(18448, 5)
(4330, 5)


Unnamed: 0,peptide_sequence_id,peptide_sequence,hla_class,tissues,Gene_names
1,1,LLPKKTESHHKAKGK,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","H2AC18, H2AC19, H2AC7, H2AC4, H2AC8, H2AC6, H2..."
2,3,PEPAKSAPAPKKGSKKAVTK,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","H2BC11, H2BC6, H2BC10, H2BC8, H2BC7, H2BC4, H2..."
3,6,RKSNAAERRGPL,HLA-II,"Adrenal gland,Bone marrow,Cerebellum,Heart,Kid...",HLA-DRA
4,10,AKGKKVAPAPAVVK,HLA-II,"Adrenal gland,Colon,Lung,Ovary,Small intestine...",RPL7A
5,16,FTRILKHGAKDKDD,HLA-II,"Adrenal gland,Aorta,Bladder,Bone marrow,Brain,...","MYL12B, MYL12A, MYL9"


### Collect mapped genes for found ligands/peptides

In [3]:
# Build the array of unique gene symbols mapped to the HLA-II ovary ligands.
# Each `Gene_names` cell holds a comma-separated list, so it is split into one
# row per gene and whitespace-trimmed before de-duplication.
hlaii_ovary_ligandome_genes = (
    hlaii_ovary_ligandome['Gene_names']
    .dropna()                              # peptides without a mapped gene must not add a `nan` "gene"
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda names: names.ne('')]       # drop empty tokens, e.g. from a trailing comma
    .unique()
)
print(len(hlaii_ovary_ligandome_genes))
print(hlaii_ovary_ligandome_genes)

1099
['H2AC18' 'H2AC19' 'H2AC7' ... 'TUBA4B' 'TUBA3E' 'RPS15A']


### Read the list of genes whose mapped peptides were found to be uniquely enriched in the MHC-II-high group of cancer samples

In [None]:
from pathlib import Path

# Read one gene symbol per line. Surrounding whitespace is stripped and blank
# lines are skipped so that empty entries are neither counted nor carried into
# the downstream comparison and output file.
only_mhcii_high_enriched_genes = [
    line.strip()
    for line in Path('PATH_TO_MAPPED_GENES.txt').read_text().splitlines()
    if line.strip()
]
print(len(only_mhcii_high_enriched_genes))

60


### Check which genes overlap with genes found in healthy ovary and remove them to get the list of genes associated only with malignancy in MHC-II-high cancer samples

In [5]:
# Set of genes seen in healthy ovary, used for fast membership tests.
healthy_ovary_gene_set = set(hlaii_ovary_ligandome_genes)

# De-duplicate the enriched genes while keeping their order from the input file.
# Converting a set to a list would give an order that can change between kernel
# sessions (string hash randomization), making the printed list and the written
# file non-reproducible.
unique_enriched_genes = list(dict.fromkeys(only_mhcii_high_enriched_genes))

# Enriched genes that are also presented in healthy ovary.
overlap = [gene for gene in unique_enriched_genes if gene in healthy_ovary_gene_set]
print(len(overlap))

# Enriched genes not found in healthy ovary.
only_mhcii_high_enriched_genes_compared_to_healthy_ovary = [
    gene for gene in unique_enriched_genes if gene not in healthy_ovary_gene_set
]
print(len(only_mhcii_high_enriched_genes_compared_to_healthy_ovary))
print(only_mhcii_high_enriched_genes_compared_to_healthy_ovary)

25
35
['SGPL1', 'EIF4H', 'UBA52', 'SNCG', 'TNS1', 'SCAMP1', 'BCL2L13', 'FAM168B', 'CIRBP', 'YAP1', 'RPSA', 'MYO1C', 'MAP1S', 'ZC3HAV1', 'ERBIN', 'FTL', 'KRT1', 'CLINT1', 'GSK3B', 'RIPK1', 'PTBP3', 'S100A11', 'CDV3', 'PKP2', 'CLNS1A', 'RPLP0', 'HMGA1', 'VCL', 'TPR', 'TFG', 'HCFC1', 'PDLIM1', 'MMP14', 'TWF2', 'HCLS1']


In [None]:
# Destination of the exported gene list
file_path = 'PATH_TO_OUTPUT/only_mhcii_high_enriched_genes_based_on_prop_compared_to_healthy_ovary.txt'

# Write the genes to the file, one symbol per newline-terminated line
with open(file_path, 'w') as out_file:
    out_file.writelines(
        f"{gene}\n" for gene in only_mhcii_high_enriched_genes_compared_to_healthy_ovary
    )