In [1]:
import pandas as pd
from pathlib import Path
import numpy as np


In [6]:
def get_sig(df, gene):
    """Return the significance recorded for ``gene`` in ``df``, or ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that has a "Gene(s)" column and a "Significance" column.
    gene : object
        Value compared for equality against the "Gene(s)" column.

    Returns
    -------
    object or None
        The "Significance" value of the first matching row. ``None`` is
        returned when no row matches or when that value is missing
        (float NaN, ``None`` or ``pd.NA``).
    """
    matches = df.loc[df["Gene(s)"] == gene, "Significance"]
    if matches.empty:
        return None

    # Only the first matching row is consulted; later rows for the same gene
    # are ignored.
    first = matches.iloc[0]

    # pd.isna recognises float NaN as well as None / pd.NA (the latter appears
    # with nullable or string dtypes and is not a float). The is_scalar guard
    # keeps non-scalar cell values passing through untouched.
    if pd.api.types.is_scalar(first) and pd.isna(first):
        return None
    return first

# Numeric weight for each significance label; larger numbers rank higher.
# Two labels share weight 4 and two share weight 2. The ``None`` key gives a
# missing significance a weight of 0.
severity_rank = dict(
    [
        ("Pathogenic", 6),
        ("Likely pathogenic", 5),
        ("Pathogenic/likely pathogenic/pathogenic, low penetrance", 4),
        ("Risk factor", 4),
        ("Association", 3),
        ("Uncertain significance", 2),
        ("Uncertain risk allele", 2),
        ("Not provided", 1),
        (None, 0),
    ]
)

def score_gene(info):
    """Add up the ``severity_rank`` weight of every value in ``info``.

    Parameters
    ----------
    info : dict
        Mapping whose values are looked up in ``severity_rank``; the keys are
        not used.

    Returns
    -------
    int
        Sum of the weights. A value with no entry in ``severity_rank``
        contributes 0.
    """
    total = 0
    for significance in info.values():
        total += severity_rank.get(significance, 0)
    return total


def build_overlap_info(gene_sets, snp_dfs, labels):
    """Describe the genes common to every collection in ``gene_sets``.

    Parameters
    ----------
    gene_sets : sequence of iterables
        Gene labels per individual; only labels present in all of them are
        reported.
    snp_dfs : sequence of pandas.DataFrame
        Tables passed to ``get_sig`` to look up each shared gene.
    labels : sequence of str
        Names paired positionally with ``snp_dfs``; they become the keys of
        each per-gene entry.

    Returns
    -------
    dict
        ``{gene: {label: significance-or-None, ..., "score": total}}``.
        Genes are inserted in the order they first appear in
        ``gene_sets[0]``. An empty ``gene_sets`` yields an empty dict.
    """
    if len(gene_sets) == 0:
        # Nothing to intersect; indexing gene_sets[0] would raise IndexError.
        return {}

    shared = set(gene_sets[0]).intersection(*gene_sets[1:])

    overlap_info = {}
    # Iterate the first collection (de-duplicated, order kept) instead of the
    # set: set order for strings changes between interpreter runs, which made
    # the order of equally scored genes differ from one run to the next.
    for gene in dict.fromkeys(gene_sets[0]):
        if gene not in shared:
            continue
        # A missing gene label (NaN / None / pd.NA) is not a gene; it would
        # only produce an entry whose lookups all come back empty.
        if pd.api.types.is_scalar(gene) and pd.isna(gene):
            continue

        entry = {label: get_sig(df, gene) for label, df in zip(labels, snp_dfs)}
        # Score is computed before the "score" key exists, so only the
        # per-label significances are counted.
        entry["score"] = score_gene(entry)
        overlap_info[gene] = entry

    return overlap_info

def rank_genes(overlap_info):
    """Return ``(gene, entry)`` pairs ordered from highest to lowest score.

    Parameters
    ----------
    overlap_info : dict
        Mapping of gene to an entry dict that contains a "score" key.

    Returns
    -------
    list of tuple
        The mapping's items sorted by ``entry["score"]`` in descending order.
        Entries with equal scores keep the order they have in
        ``overlap_info``.
    """
    def _score_of(pair):
        _gene, entry = pair
        return entry["score"]

    ranked = list(overlap_info.items())
    ranked.sort(key=_score_of, reverse=True)
    return ranked

In [9]:
# Read the three per-individual SNP tables (son, father, mother) from CSV.
son_snp_df, father_snp_df, mother_snp_df = (
    pd.read_csv(Path(csv_file), delimiter=",")
    for csv_file in (
        "data/hg002-son-snp.csv",
        "data/hg003-father-snp.csv",
        "data/hg004-mother-snp.csv",
    )
)

# Gene labels of each table as plain lists (duplicates are kept here;
# build_overlap_info turns them into sets).
son_genes, father_genes, mother_genes = (
    list(frame["Gene(s)"])
    for frame in (son_snp_df, father_snp_df, mother_snp_df)
)

# Genes present in all three tables.
overlap_3 = build_overlap_info(
    gene_sets=[son_genes, mother_genes, father_genes],
    snp_dfs=[son_snp_df, mother_snp_df, father_snp_df],
    labels=["son", "mother", "father"],
)

# Pairwise overlaps.
overlap_mother_father = build_overlap_info(
    gene_sets=[mother_genes, father_genes],
    snp_dfs=[mother_snp_df, father_snp_df],
    labels=["mother", "father"],
)
overlap_mother_son = build_overlap_info(
    gene_sets=[mother_genes, son_genes],
    snp_dfs=[mother_snp_df, son_snp_df],
    labels=["mother", "son"],
)
overlap_father_son = build_overlap_info(
    gene_sets=[father_genes, son_genes],
    snp_dfs=[father_snp_df, son_snp_df],
    labels=["father", "son"],
)

# Order every overlap from highest to lowest combined score.
ranked_3, ranked_mother_father, ranked_mother_son, ranked_father_son = map(
    rank_genes,
    (overlap_3, overlap_mother_father, overlap_mother_son, overlap_father_son),
)

# Print each ranking under its heading. Every gene in the overlap is printed,
# not just the first few.
for heading, ranking in (
    ("\nTop overlapping genes (all members):", ranked_3),
    ("\nTop overlapping genes (mother-father):", ranked_mother_father),
    ("\nTop overlapping genes (mother-son):", ranked_mother_son),
    ("\nTop overlapping genes (father-son):", ranked_father_son),
):
    print(heading)
    for g, info in ranking:
        print(g, info)


Top overlapping genes (all members):
PERM1 {'son': 'Pathogenic', 'mother': 'Pathogenic', 'father': 'Pathogenic', 'score': 18}
GBP1 {'son': 'Likely pathogenic', 'mother': 'Likely pathogenic', 'father': 'Likely pathogenic', 'score': 15}
CHI3L1 {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 12}
FCGR2B {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 12}
ECE1 {'son': 'Risk factor', 'mother': 'Risk factor', 'father': 'Risk factor', 'score': 12}
PADI2 {'son': 'Association', 'mother': 'Association', 'father': 'Association', 'score': 9}
BGLAP, PAQR6 {'son': 'Association', 'mother': 'Association', 'father': 'Association', 'score': 9}
CYP4Z1, CYP4A22-AS1 {'son': 'Association', 'mother': 'Association', 'father': 'Association', 'score': 9}
MGST3 {'son': 'Association', 'mother': 'Association', 'father': 'Association', 'score': 9}
TNFRSF1B {'son': 'Uncertain significance', 'mother': 'Uncertain significance', 'father': 'Uncertain si