In [17]:
import pandas as pd
import json

from pyspark.sql import SparkSession
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.driver.memory", "4g")  # Set to your desired heap size

spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()


## Load genotypes

In [19]:
vcfLikePath='./als_1kg.exon.num.txt'

genotypes = pd.read_csv(vcfLikePath, sep='\t', index_col=['chrom', 'position'], engine="pyarrow")



In [None]:
genotypes.shape

                                                                                

(654394, 5923)

In [None]:
genotypes.columns

Index(['Gene', 'Func.knownGene', 'Func.refGene', 'Func.ensGene', 'ExonicFunc',
       'AAChange.knownGene', 'AAChange.refGene', 'AAChange.ensGene', 'FILTER',
       'REF',
       ...
       'CGND-HDA-02442', 'CGND-HDA-02438', 'CGND-HDA-02445', 'CGND-HDA-02693',
       'CGND-HDA-02446', 'CGND-HDA-02444', 'CGND-HDA-02439', 'CGND-HDA-02462',
       'CGND-HDA-02688', 'CGND-HDA-02440'],
      dtype='object', length=5923)

## Load random gene set

In [None]:
import random

def load_msigdb(file_name):
    """Load the MSigDB .gmt file and return a dictionary of gene sets."""
    pathways = {}
    with open(file_name, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            pathway_name = parts[0]
            genes = parts[2:]
            pathways[pathway_name] = genes
    return pathways

def sample_random_gene_from_n_pathways(pathways, n):
    """Sample a random gene from n random pathways."""
    sampled_genes = []
    sampled_pathways = random.sample(list(pathways.keys()), n)

    for pathway in sampled_pathways:
        gene = random.choice(pathways[pathway])
        sampled_genes.append((pathway, gene))

    return sampled_genes

# Load MSigDB gene sets
file_name = "c2.cp.v2023.1.Hs.symbols.gmt"  # Replace with your downloaded file path
pathways = load_msigdb(file_name)

# Sample a random gene from n random pathways
n = 150  # Change this to your desired n
random_results = sample_random_gene_from_n_pathways(pathways, n)

for pathway, gene in random_results:
    print(f"Pathway: {pathway}, Gene: {gene}")


Pathway: KEGG_PEROXISOME, Gene: HAO2
Pathway: REACTOME_APEX1_INDEPENDENT_RESOLUTION_OF_AP_SITES_VIA_THE_SINGLE_NUCLEOTIDE_REPLACEMENT_PATHWAY, Gene: OGG1
Pathway: BIOCARTA_SALMONELLA_PATHWAY, Gene: WASF1
Pathway: REACTOME_SEMA3A_PLEXIN_REPULSION_SIGNALING_BY_INHIBITING_INTEGRIN_ADHESION, Gene: FES
Pathway: WP_LTF_DANGER_SIGNAL_RESPONSE_PATHWAY, Gene: IL1B
Pathway: REACTOME_REPRODUCTION, Gene: CD9
Pathway: WP_UREA_CYCLE_AND_RELATED_DISEASES, Gene: ASL
Pathway: REACTOME_EGFR_TRANSACTIVATION_BY_GASTRIN, Gene: KRAS
Pathway: PID_MYC_REPRESS_PATHWAY, Gene: EP300
Pathway: REACTOME_ZINC_TRANSPORTERS, Gene: SLC30A3
Pathway: REACTOME_RESOLUTION_OF_D_LOOP_STRUCTURES, Gene: ATM
Pathway: REACTOME_SIGNALING_BY_FGFR2_IIIA_TM, Gene: POLR2E
Pathway: REACTOME_SUMOYLATION_OF_INTRACELLULAR_RECEPTORS, Gene: PIAS1
Pathway: REACTOME_PKA_ACTIVATION_IN_GLUCAGON_SIGNALLING, Gene: ADCY8
Pathway: REACTOME_ACTIVATION_OF_NOXA_AND_TRANSLOCATION_TO_MITOCHONDRIA, Gene: TFDP2
Pathway: BIOCARTA_MHC_PATHWAY, Gene: TAP1
P

## Load gene sets

In [5]:
def readJsonFile(path):
    with open(path) as f:
        data = json.load(f)
    return data

In [14]:
import numpy as np
geneSets = {
    # 'alsod': ps.read_csv('./alsodList.csv', sep='\t')['Gene symbol'].tolist(),
    # 'cardiac': readJsonFile('./BIOCARTA_ALK_PATHWAY.v2023.1.Hs.json')['BIOCARTA_ALK_PATHWAY']['geneSymbols'] 
    #     + readJsonFile('./BIOCARTA_ACE2_PATHWAY.v2023.1.Hs.json')['BIOCARTA_ACE2_PATHWAY']['geneSymbols'] 
    #     + readJsonFile('./BIOCARTA_AT1R_PATHWAY.v2023.1.Hs.json')['BIOCARTA_AT1R_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_CARDIACEGF_PATHWAY.v2023.1.Hs.json')['BIOCARTA_CARDIACEGF_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_FLUMAZENIL_PATHWAY.v2023.1.Hs.json')['BIOCARTA_FLUMAZENIL_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_GCR_PATHWAY.v2023.1.Hs.json')['BIOCARTA_GCR_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_HDAC_PATHWAY.v2023.1.Hs.json')['BIOCARTA_HDAC_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_NFAT_PATHWAY.v2023.1.Hs.json')['BIOCARTA_NFAT_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_PGC1A_PATHWAY.v2023.1.Hs.json')['BIOCARTA_PGC1A_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_PITX2_PATHWAY.v2023.1.Hs.json')['BIOCARTA_PITX2_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_AMI_PATHWAY.v2023.1.Hs.json')['BIOCARTA_AMI_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_P53HYPOXIA_PATHWAY.v2023.1.Hs.json')['BIOCARTA_P53HYPOXIA_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_NO1_PATHWAY.v2023.1.Hs.json')['BIOCARTA_NO1_PATHWAY']['geneSymbols']
    #     + readJsonFile('./BIOCARTA_HIF_PATHWAY.v2023.1.Hs.json')['BIOCARTA_HIF_PATHWAY']['geneSymbols']
    "NUPs": pd.read_excel("../adhoc analysis/Variant_report_NUPs_fixed_2022-03-28.xlsx", sheet_name="all cases vs all controls")['Gene'].dropna().tolist(),
    "Random": np.array(random_results)[:,1].tolist()
        }

NameError: name 'pd' is not defined

In [12]:
for set in geneSets:
    print(geneSets[set])

['RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'RCC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'NDC1', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'TPR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'LBR', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'NUP133', 'N

## Filter gene sets

In [13]:
filteredGeneSets = {}
for setName in geneSets:
    filteredGeneSets[setName] = genotypes[(genotypes['Gene'].isin(geneSets[setName]) & (genotypes['ExonicFunc'] == 'nonsynonymous_SNV'))]


In [None]:
import subprocess

for setName in filteredGeneSets:
    print(f"{setName}: {filteredGeneSets[setName].shape}")
    filteredGeneSets[setName].reset_index().to_csv(f'./{setName}Genotypes', sep="\t", index=False)
    
   # Construct and run the concatenation command with awk to remove redundant headers
    cat_command = f"awk '(NR == 1) || (FNR > 1)' ./{setName}Genotypes/part*.csv > ./{setName}Genotypes.csv"
    subprocess.run(cat_command, shell=True, check=True)
    
    # Construct and run the removal command
    rm_command = f"rm -r ./{setName}Genotypes"
    subprocess.run(rm_command, shell=True, check=True)

