In [43]:
import pandas as pd
from pathlib import Path
import numpy as np


In [44]:
def get_sig(df, gene):
    """Return significance value or None."""
    vals = df.loc[df["Gene(s)"] == gene, "Significance"]
    if len(vals) == 0:
        return None
    val = vals.iloc[0]
    return None if (isinstance(val, float) and np.isnan(val)) else val

severity_rank = {
    "Pathogenic": 6,
    "Likely pathogenic": 5,
    'Pathogenic/likely pathogenic/pathogenic, low penetrance': 4,
    "Risk factor": 4,
    "Association": 3,
    "Uncertain significance": 2,
    "Uncertain risk allele": 2,
    "Not provided": 1,
    None: 0,
}

def score_gene(info):
    return max(
        severity_rank.get(info["son"], 0),
        severity_rank.get(info["mother"], 0),
        severity_rank.get(info["father"], 0),
    )


In [48]:
son_snp_df = pd.read_csv(Path("data/hg002-son-snp.csv"), delimiter=",")
father_snp_df = pd.read_csv(Path("data/hg003-father-snp.csv"), delimiter=",")
mother_snp_df = pd.read_csv(Path("data/hg004-mother-snp.csv"), delimiter=",")

son_genes = list(son_snp_df["Gene(s)"])
father_genes = list(father_snp_df["Gene(s)"])
mother_genes = list(mother_snp_df["Gene(s)"])

# overlap_gene_set = set(son_genes) & set(father_genes) & set(mother_genes)
# print("len of overlapping genes", len(overlap_gene_set))

# sig_values = []
# for gene in overlap_gene_set:
#     significance = son_snp_df.loc[son_snp_df["Gene(s)"] == gene, "Significance"].iloc[0]
#     if significance not in sig_values:
#         sig_values.append(significance)
# print(sig_values)


# compute overlap
overlap_gene_set = set(son_genes) & set(father_genes) & set(mother_genes)
print("len of overlapping genes:", len(overlap_gene_set))

overlap_info = {}
sig_categories = []

for gene in overlap_gene_set:
    son_sig = get_sig(son_snp_df, gene)
    mother_sig = get_sig(mother_snp_df, gene)
    father_sig = get_sig(father_snp_df, gene)

    overlap_info[gene] = {
        "son": son_sig,
        "mother": mother_sig,
        "father": father_sig,
    }

    for val in (son_sig, mother_sig, father_sig):
        if val not in sig_categories:
            sig_categories.append(val)

print("Unique significance categories:", sig_categories)

for gene, info in overlap_info.items():
    overlap_info[gene]["score"] = score_gene(info)

# sort genes
ranked_genes = sorted(
    overlap_info.items(),
    key=lambda x: x[1]["score"],
    reverse=True
)

# print top 10
print("\n Most significant overlapping genes:")
for g, info in ranked_genes:
    print(g, info)

len of overlapping genes: 32
Unique significance categories: ['Likely pathogenic', None, 'Association', 'Not provided', 'Uncertain significance', 'Pathogenic', 'Risk factor', 'Pathogenic/likely pathogenic/pathogenic, low penetrance', 'Uncertain risk allele']

 Most significant overlapping genes:
PERM1 {'son': 'Pathogenic', 'mother': 'Pathogenic', 'father': 'Pathogenic', 'score': 6}
GBP1 {'son': 'Likely pathogenic', 'mother': 'Likely pathogenic', 'father': 'Likely pathogenic', 'score': 5}
ECE1 {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 4}
ABCA4 {'son': None, 'mother': None, 'father': 'Pathogenic/likely pathogenic/pathogenic, low penetrance', 'score': 4}
CHI3L1 {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 4}
FCGR2B {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 4}
MGST3 {'son': 'Association', 'mother': 'Association', 'father': 'Association', 'score': 3}
BGLAP, PAQR6 {'son': 'Ass