### Filter the results of GISTIC2 by CNA driver genes identified by COSMIC census and NG paper https://doi.org/10.1038/s41588-018-0331-5


In [3]:
import os
import pandas as pd

# Read known driver genes from a CSV file
known_driver_genes = pd.read_csv("/Users/wu04/Library/CloudStorage/OneDrive-UniversityofCambridge/projects/BO_gene_list/data/GISTIC/Census_allMon May 13 18_04_11 2024.csv")

# Filter known driver genes based on mutation types -- only keep CNA driver genes
known_driver_genes = known_driver_genes[known_driver_genes['Mutation Types'].str.contains('A|D', na=False)]

# Extract the gene symbols from the filtered known driver genes
known_driver_genes = known_driver_genes["Gene Symbol"].to_list()

# Print the number of known driver genes
print("Number of known driver genes:", len(known_driver_genes))

# Define CNA driver genes identified by Alex
CNA_driver_genes_NG_paper = ['KRAS', 'GATA4', 'ERBB2', 'ERBB2', 'ERBB2', 'CCND3', 'CCND1', 'CCND1', 'CCNE1', 'MET', 'EGFR', 'MET', 'CDK6','CDK6','MDM2','GATA6','PPM1D','MYC','CDKN2A','ARID1A','APC','PTEN','SMAD4','AXIN1','CDH11']

# Print the number of CNA driver genes identified by Alex
print("Number of CNA driver genes identified by NG paper:", len(CNA_driver_genes_NG_paper))


Number of known driver genes: 97
Number of CNA driver genes identified by NG paper: 25


In [7]:
# Combine known driver genes and CNA driver genes identified by NG paper
combined_driver_genes = known_driver_genes + CNA_driver_genes_NG_paper

# Remove duplicates from the combined list
combined_driver_genes_ = list(dict.fromkeys(combined_driver_genes))

# Print the combined driver genes without duplicates
print("Combined driver genes without duplicates:", combined_driver_genes_)

# Print the number of combined driver genes without duplicates
print("Number of combined driver genes without duplicates:", len(combined_driver_genes_))


Combined driver genes without duplicates: ['ACVR1B', 'AKT2', 'AKT3', 'ALK', 'AMER1', 'APC', 'APOBEC3B', 'ARHGEF10', 'ARHGEF10L', 'ARID1A', 'ASPM', 'ATM', 'AXIN1', 'BAZ1A', 'BIRC3', 'BRCA1', 'BRCA2', 'CASP3', 'CASP9', 'CCNC', 'CCNE1', 'CDKN2A', 'CDKN2C', 'CPEB3', 'CTNNA1', 'DCC', 'DROSHA', 'EGFR', 'ERBB2', 'ERG', 'FADD', 'FANCA', 'FANCC', 'FANCD2', 'FBXO11', 'FBXW7', 'FLT4', 'GOLPH3', 'GPC3', 'GPC5', 'GRM3', 'HGF', 'IGF2BP2', 'IKZF1', 'IKZF3', 'JUN', 'KAT6A', 'KDM6A', 'LARP4B', 'LATS2', 'LEPROTL1', 'LMO1', 'LRP1B', 'MAP2K4', 'MDM2', 'MDM4', 'MEN1', 'MGMT', 'MITF', 'MLH1', 'MSH2', 'MYC', 'MYCL', 'MYCN', 'N4BP2', 'NF1', 'NF2', 'NKX2-1', 'NSD3', 'NTRK1', 'PAX5', 'PBRM1', 'PPM1D', 'PRDM1', 'PTEN', 'PTPRD', 'RAD17', 'RAD50', 'RAF1', 'RAP1B', 'RB1', 'REL', 'RFWD3', 'SETD1B', 'SMAD4', 'SMARCB1', 'SOX2', 'SOX21', 'STK11', 'SUB1', 'SUFU', 'TNFAIP3', 'TSC1', 'TSC2', 'USP44', 'VHL', 'WT1', 'KRAS', 'GATA4', 'CCND3', 'CCND1', 'MET', 'CDK6', 'GATA6', 'CDH11']
Number of combined driver genes without d

### BE_samples_205

In [8]:
# Set the source path
source_path = '/Users/wu04/Library/CloudStorage/OneDrive-UniversityofCambridge/projects/BO_gene_list/results/gistic'

# Define the file path for the amp_gene_list
amp_gene_list_path = os.path.join(source_path, "samples_205_excludeGM/conf_75_q_25_no_ploidy_adjust_203_samples/amp_gene_list_widen.txt")

# Read the amp_gene_list from the file
amp_gene_list = pd.read_csv(amp_gene_list_path, header=None, sep="\t")
amp_gene_list = amp_gene_list[0].to_list()

# Print the number of amp_gene_list
print("The number of amp_gene_list:", len(amp_gene_list))

# Find the overlapped genes between combined_driver_genes_ and amp_gene_list
amp_205_overlapped_genes = set(combined_driver_genes_).intersection(amp_gene_list)

# Print the number of overlapped genes
print("The number of overlapped genes:", len(amp_205_overlapped_genes))
print(amp_205_overlapped_genes)

The number of amp_gene_list: 1385
The number of overlapped genes: 5
{'CCND1', 'GATA6', 'CDK6', 'FADD', 'STK11'}


In [9]:
# Set the file path for the del_gene_list
del_gene_list_path = os.path.join(source_path, "samples_205_excludeGM/conf_75_q_25_no_ploidy_adjust_203_samples/del_gene_list.txt")

# Read the del_gene_list from the file
del_gene_list = pd.read_csv(del_gene_list_path, header=None, sep="\t")
del_gene_list = del_gene_list[0].to_list()

# Print the number of del_gene_list
print("The number of del_gene_list:", len(del_gene_list))

# Find the overlapped genes between combined_driver_genes_ and del_gene_list
del_205_overlapped_genes = set(combined_driver_genes_).intersection(del_gene_list)

# Print the number of overlapped genes
print("The number of overlapped genes:", len(del_205_overlapped_genes))
print(del_205_overlapped_genes)

The number of del_gene_list: 542
The number of overlapped genes: 4
{'DCC', 'ERG', 'SMAD4', 'CDKN2A'}


In [10]:
# Combine amp_gene_list and del_gene_list to create a gene_list for 205 samples
gene_list_205_samples = pd.Series(amp_gene_list + del_gene_list).drop_duplicates().tolist()
print("The number of genes in the gene_list for 205 samples:", len(gene_list_205_samples))

# Combine amp_205_overlapped_genes and del_205_overlapped_genes to create a list of overlapped genes for 205 samples
overlapped_gene_list_205_samples = list(amp_205_overlapped_genes.union(del_205_overlapped_genes))
print("The number of overlapped genes in the gene_list for 205 samples:", len(overlapped_gene_list_205_samples))
print(overlapped_gene_list_205_samples)

The number of genes in the gene_list for 205 samples: 1853
The number of overlapped genes in the gene_list for 205 samples: 9
['CCND1', 'GATA6', 'ERG', 'CDK6', 'CDKN2A', 'FADD', 'DCC', 'STK11', 'SMAD4']


In [12]:
overlapped_gene_list_205_samples = pd.DataFrame(overlapped_gene_list_205_samples, columns = ['gene_name'])
overlapped_gene_list_205_samples = overlapped_gene_list_205_samples[overlapped_gene_list_205_samples['gene_name'] != 'FHIT']
print("The number of filtered overlapped gene_list in 205 samples:", len(overlapped_gene_list_205_samples))


overlapped_gene_list_205_samples.to_csv(os.path.join(source_path, "samples_205_excludeGM/conf_95_overlapped_CNA_drive_genes.csv"), index = False)
# with open (os.path.join(source_path, "samples_205_excludeGM/conf_95_overlapped_drive_genes.txt"), 'w') as file:
#     for item in overlapped_gene_list_205_samples:
#         file.write("%s\n" %item)

The number of filtered overlapped gene_list in 205 samples: 9


In [13]:
overlapped_gene_list_205_samples

Unnamed: 0,gene_name
0,CCND1
1,GATA6
2,ERG
3,CDK6
4,CDKN2A
5,FADD
6,DCC
7,STK11
8,SMAD4


### Genomics_England data

In [15]:
gel_amp_gene_list_path = os.path.join(source_path, "gel/conf_75_q_25_no_ploidy_adjust/amp_gene_list_widen.txt")

gel_amp_gene_list = pd.read_csv(gel_amp_gene_list_path, header=None, sep="\t")
gel_amp_gene_list = gel_amp_gene_list[0].to_list()
print("The number of gel amp genes:", len(gel_amp_gene_list))

amp_gel_overlapped_genes = set(combined_driver_genes_).intersection(gel_amp_gene_list)
print("Overlapped genes:", amp_gel_overlapped_genes)
print("The number of overlapped genes:", len(amp_gel_overlapped_genes))

The number of gel amp genes: 400
Overlapped genes: {'GATA6'}
The number of overlapped genes: 1


In [16]:
gene_list_path = os.path.join(source_path, "gel/conf_75_q_25_no_ploidy_adjust/del_gene_list.txt")

gel_del_gene_list = pd.read_csv(gene_list_path, header=None, sep="\t")
gel_del_gene_list = gel_del_gene_list[0].to_list()
print("The number of gel del genes:", len(gel_del_gene_list))

del_gel_overlapped_genes = set(combined_driver_genes_).intersection(gel_del_gene_list)
print("Overlapped genes:", del_gel_overlapped_genes)
print("The number of overlapped genes:", len(del_gel_overlapped_genes))

The number of gel del genes: 1366
Overlapped genes: {'MEN1', 'CCND1', 'PTPRD', 'PAX5', 'CDKN2A', 'FANCC', 'FADD', 'TSC1'}
The number of overlapped genes: 8


### In the final selection, STK11, FADD, DCC, MEN1, PAX5, TSC1, ATM, WT1, CDKN2A, CCND1, FADD, PTPRD, LMO1, FANCC, and BIRC3 were excluded due to overlapping peaks. ### ### ### ### Consequently, the final list of CNA driver genes identified includes GATA6, CCND1, CDK6, SMAD4, and ERG.