In [1]:
import os
import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)


pd_data_dir = "/home/djl34/lab_pd/data"
aso_data_dir = "/home/djl34/lab_pd/aso/data"
KL_data_dir = "/home/djl34/lab_pd/kl/data"
scratch_dir = "/n/scratch3/users/d/djl34"

## get a list of genes that are AD and lof

In [2]:
filename = aso_data_dir + "/Clingen-Gene-Disease-Summary-2023-06-24.csv"
df = pd.read_csv(filename)

filename = aso_data_dir + "/Clingen-Dosage-Sensitivity-2023-06-25.csv"
df_ds = pd.read_csv(filename)
df_ds_haploinsufficient = df_ds[df_ds["HAPLOINSUFFICIENCY"].isin(['Sufficient Evidence for Haploinsufficiency', 'Emerging Evidence for Haploinsufficiency'])]

In [3]:
df_ds["HAPLOINSUFFICIENCY"].unique()

array(['Gene Associated with Autosomal Recessive Phenotype',
       'Sufficient Evidence for Haploinsufficiency',
       'No Evidence for Haploinsufficiency',
       'Little Evidence for Haploinsufficiency',
       'Emerging Evidence for Haploinsufficiency',
       'Dosage Sensitivity Unlikely'], dtype=object)

In [4]:
df["CLASSIFICATION"].unique()

array(['Disputed', 'Definitive', 'Moderate', 'Limited',
       'No Known Disease Relationship', 'Strong', 'Refuted'], dtype=object)

In [5]:
df_AD = df[df["MOI"] == "AD"]

df_AD_strong = df_AD[df_AD["CLASSIFICATION"].isin(["Strong", "Definitive"])]

df_AD_strong_lof = df_AD_strong.merge(df_ds_haploinsufficient, on = "GENE SYMBOL", how = "inner")

In [6]:
gene_list = list(df_AD_strong_lof["GENE SYMBOL"].unique())

In [7]:
len(gene_list)

191

In [8]:
filename = pd_data_dir + "/biomart/ENSG_Genename_syn.tsv"
df_ensg = pd.read_csv(filename, sep = "\t")
df_ensg = df_ensg.rename({"Gene name": "GENE SYMBOL", "Gene stable ID" : "Gene"}, axis = 1)
df_ensg = df_ensg[["GENE SYMBOL", "Gene", "Chromosome/scaffold name"]].drop_duplicates()
df_ensg = df_ensg[df_ensg["Chromosome/scaffold name"].str.contains("CHR") == False]
df_ensg = df_ensg.rename({"Gene name": "GENE SYMBOL", "Gene stable ID" : "Gene"}, axis = 1)

df_AD_strong_lof_ensg = df_AD_strong_lof.merge(df_ensg, on = "GENE SYMBOL", how = "left")

ensg_gene_list = list(df_AD_strong_lof_ensg["Gene"].unique())

## combine with lof data

In [9]:
filename = aso_data_dir + "/fake_transcript_variants_v4_lof_HC_filtered_unint.tsv"
df_lof = pd.read_csv(filename, sep = "\t")

In [8]:
df_lof.columns

Index(['Gene', 'Feature', 'Chrom', 'Pos', 'Allele_ref', 'Allele',
       'Consequence', 'UNIPROT_ISOFORM', 'gnomAD_AF', 'gnomAD_NFE_AF',
       'LoF_flags', 'CANONICAL', 'mu_nonhyper', 'mu_hypermut', 'mu_quality',
       'context', 'mean', 'median', 'over_1', 'over_5', 'over_10', 'over_15',
       'over_20', 'over_25', 'over_30', 'over_50', 'over_100', 'filter', 'AN',
       'AN_nfe', 'AC', 'AC_nfe', 'controls_AN', 'controls_AC', 'non_cancer_AN',
       'non_cancer_AC', 'BaseQRankSum', 'ClippingRankSum', 'DP', 'FS',
       'InbreedingCoeff', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR',
       'VQSLOD', 'VQSR_NEGATIVE_TRAIN_SITE', 'VQSR_POSITIVE_TRAIN_SITE',
       'VQSR_culprit', 'allele_type', 'dp_hist_all_n_larger',
       'dp_hist_alt_n_larger', 'has_star', 'lcr', 'n_alt_alleles', 'pab_max',
       'rf_label', 'rf_negative_label', 'rf_positive_label',
       'rf_tp_probability_label', 'interp_dist', 'site_in_genome_bottle',
       'ref_alt', 'Carlson_rate', 'tri', 'methylation_

In [10]:
df_lof = df_lof[["Gene", "Chrom", "Pos", "Allele_ref", "Allele", "Consequence", "LoF_flags", "filter", "mu", "AN", "AC_unint", "AN_nfe", "AC_nfe_unint"]]

In [13]:
df_AD_strong_lof_ensg = df_AD_strong_lof.merge(df_ensg, on = "GENE SYMBOL", how = "left")

In [14]:
df_AD_strong_lof_ensg["Chromosome/scaffold name"].unique()

array(['12', '20', '1', '4', '16', '5', '2', '6', '18', '11', '7', '10',
       '3', '17', '13', '9', '15', '8', '14', '22', '21', '19'],
      dtype=object)

In [15]:
df_AD_strong_lof_ensg = df_AD_strong_lof_ensg[["GENE SYMBOL", "Gene", "DISEASE LABEL", "MOI", "CLASSIFICATION", "GCEP", "HAPLOINSUFFICIENCY"]]

In [16]:
df_AD_strong_lof_ensg[["GENE SYMBOL", "Gene"]].drop_duplicates()

Unnamed: 0,GENE SYMBOL,Gene
0,ACVRL1,ENSG00000139567
1,ADNP,ENSG00000101126
2,AHDC1,ENSG00000126705
3,ANK2,ENSG00000145362
4,ANKRD11,ENSG00000167522
...,...,...
217,VHL,ENSG00000134086
218,WAC,ENSG00000095787
219,WT1,ENSG00000184937
221,ZEB2,ENSG00000169554


In [17]:
df_AD_strong_lof_ensg[["GENE SYMBOL", "DISEASE LABEL", "HAPLOINSUFFICIENCY"]].to_csv("../data/genes_AD_lof_strong.tsv",
                                                                                    sep = "\t", index = None)

In [18]:
df_AD_strong_lof_ensg_variants = df_AD_strong_lof_ensg[["GENE SYMBOL", "Gene"]].drop_duplicates().merge(df_lof, on = "Gene", how = "left")

In [18]:
# df_AD_strong_lof_ensg_variants.to_csv("../data/variants_AD_lof_strong.tsv", sep = "\t", index = None)

## filter by splice acceptor variants

In [28]:
df_AD_strong_lof_ensg_variants = df_AD_strong_lof_ensg_variants[df_AD_strong_lof_ensg_variants["Chrom"].isna() == False]
df_AD_strong_lof_ensg_variants["Chrom"] = df_AD_strong_lof_ensg_variants["Chrom"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_AD_strong_lof_ensg_variants["Chrom"] = df_AD_strong_lof_ensg_variants["Chrom"].astype(int)


In [19]:
df_variants = df_AD_strong_lof_ensg_variants[df_AD_strong_lof_ensg_variants["Chrom"].isna() == False]

In [32]:
df_variants = df_AD_strong_lof_ensg_variants[df_AD_strong_lof_ensg_variants["Chrom"] == 22]

In [26]:
df_AD_strong_lof_ensg_variants["Chrom"].unique()

array([12., 20.,  1.,  4., 16.,  5.,  2.,  6., 18., 11.,  7., 10.,  3.,
       17., 13.,  9., 15.,  8., 14., 22., 21., 19., nan])

In [20]:
df_variants["Consequence"].unique()

array(['stop_gained', 'splice_acceptor_variant', 'splice_donor_variant',
       'stop_gained&splice_region_variant'], dtype=object)

In [34]:
df_variants = df_variants[(df_variants["Consequence"].str.contains("splice_acceptor_variant")) | (df_variants["Consequence"].str.contains("splice_donor_variant"))]

In [35]:
df_variants

Unnamed: 0,GENE SYMBOL,Gene,Chrom,Pos,Allele_ref,Allele,Consequence,LoF_flags,filter,mu,AN,AC_unint,AN_nfe,AC_nfe_unint
19789,CHEK2,ENSG00000183765,22,28687987.0,C,A,splice_acceptor_variant,.,,0.041,,,,
19790,CHEK2,ENSG00000183765,22,28687987.0,C,T,splice_acceptor_variant,.,,0.105,,,,
19791,CHEK2,ENSG00000183765,22,28687987.0,C,G,splice_acceptor_variant,.,,0.041,,,,
19792,CHEK2,ENSG00000183765,22,28687988.0,T,G,splice_acceptor_variant,.,,0.030,,,,
19793,CHEK2,ENSG00000183765,22,28687988.0,T,C,splice_acceptor_variant,.,,0.094,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76699,TNRC6B,ENSG00000100354,22,40322852.0,A,C,splice_acceptor_variant,.,,0.030,,,,
76700,TNRC6B,ENSG00000100354,22,40322852.0,A,G,splice_acceptor_variant,.,,0.139,,,,
76701,TNRC6B,ENSG00000100354,22,40322852.0,A,T,splice_acceptor_variant,.,,0.030,,,,
76702,TNRC6B,ENSG00000100354,22,40322853.0,G,C,splice_acceptor_variant,.,,0.073,,,,


In [21]:
df_variants.to_csv("../data/variants_splice_acceptor_donor_AD_lof_strong.tsv", sep = "\t", index = None)

In [22]:
df_variants

Unnamed: 0,GENE SYMBOL,Gene,Chrom,Pos,Allele_ref,Allele,Consequence,LoF_flags,filter,mu,AN,AC_unint,AN_nfe,AC_nfe_unint
3,ACVRL1,ENSG00000139567,12.0,51913097.0,A,G,splice_acceptor_variant,NAGNAG_SITE,,0.139,,,,
4,ACVRL1,ENSG00000139567,12.0,51913097.0,A,T,splice_acceptor_variant,NAGNAG_SITE,,0.020,,,,
5,ACVRL1,ENSG00000139567,12.0,51913098.0,G,C,splice_acceptor_variant,NAGNAG_SITE,RF,0.020,232218.0,0.0,105398.0,0.0
6,ACVRL1,ENSG00000139567,12.0,51913098.0,G,A,splice_acceptor_variant,NAGNAG_SITE,RF,0.094,232218.0,1.0,105398.0,1.0
23,ACVRL1,ENSG00000139567,12.0,51913351.0,G,C,splice_donor_variant,.,,0.051,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96717,ZNF462,ENSG00000148143,9.0,107009670.0,T,G,splice_donor_variant,.,,0.020,,,,
96718,ZNF462,ENSG00000148143,9.0,107010821.0,A,T,splice_acceptor_variant,.,,0.020,,,,
96719,ZNF462,ENSG00000148143,9.0,107010821.0,A,C,splice_acceptor_variant,.,,0.030,,,,
96720,ZNF462,ENSG00000148143,9.0,107010822.0,G,C,splice_acceptor_variant,.,,0.051,,,,


In [22]:
for x in list(df_AD_strong_lof_ensg["DISEASE LABEL"].unique()):
    print(x, end = ", ")

telangiectasia, hereditary hemorrhagic, type 2, ADNP-related multiple congenital anomalies - intellectual disability - autism spectrum disorder, AHDC1-related intellectual disability - obstructive sleep apnea - mild dysmorphism syndrome, complex neurodevelopmental disorder, KBG syndrome, syndromic intellectual disability, gastric adenocarcinoma and proximal polyposis of the stomach, classic or attenuated familial adenomatous polyposis, hypercholesterolemia, autosomal dominant, type B, Coffin-Siris syndrome, Bohring-Opitz syndrome, hereditary breast carcinoma, myofibrillar myopathy, dilated cardiomyopathy, BAP1-related tumor predisposition syndrome, Dias-Logan syndrome, juvenile polyposis syndrome, pulmonary arterial hypertension, breast-ovarian cancer, familial, susceptibility to, 1, breast-ovarian cancer, familial, susceptibility to, 2, familial ovarian cancer, hyperparathyroidism 2 with jaw tumors, hereditary diffuse gastric adenocarcinoma, multiple endocrine neoplasia type 4, melano

In [23]:
df_variants[["GENE SYMBOL", "DISEASE LABEL", "HAPLOINSUFFICIENCY"]].drop_duplicates()

KeyError: "['DISEASE LABEL', 'HAPLOINSUFFICIENCY'] not in index"

In [23]:
len(df_variants["GENE SYMBOL"].unique())

186

In [113]:
len(df_variants)

15981

In [24]:
df_variants["mu_pergen"] = df_variants["mu"] * 1.015 * 10 **-7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_variants["mu_pergen"] = df_variants["mu"] * 1.015 * 10 **-7


In [25]:
df_variants["mu_pergen"].sum()

0.00022304493049999993

In [26]:
1/df_variants["mu_pergen"].sum()

4483.401607731229

In [27]:
3.66*10**6 / 4483.401607731229


816.3444456299998

In [82]:
df_variants["LoF_flags"].unique()

array(['NAGNAG_SITE', '.', 'NON_CAN_SPLICE'], dtype=object)

In [83]:
df_variants

Unnamed: 0,GENE SYMBOL,Gene,DISEASE LABEL,MOI,CLASSIFICATION,GCEP,HAPLOINSUFFICIENCY,Chrom,Pos,Allele_ref,Allele,Consequence,LoF_flags,filter,mu,AN,AC_unint,AN_nfe,AC_nfe_unint,mu_pergen
3,ACVRL1,ENSG00000139567,"telangiectasia, hereditary hemorrhagic, type 2",AD,Definitive,Hemostasis Thrombosis,Sufficient Evidence for Haploinsufficiency,12.0,51913097.0,A,G,splice_acceptor_variant,NAGNAG_SITE,,0.139,,,,,1.410850e-08
4,ACVRL1,ENSG00000139567,"telangiectasia, hereditary hemorrhagic, type 2",AD,Definitive,Hemostasis Thrombosis,Sufficient Evidence for Haploinsufficiency,12.0,51913097.0,A,T,splice_acceptor_variant,NAGNAG_SITE,,0.020,,,,,2.030000e-09
5,ACVRL1,ENSG00000139567,"telangiectasia, hereditary hemorrhagic, type 2",AD,Definitive,Hemostasis Thrombosis,Sufficient Evidence for Haploinsufficiency,12.0,51913098.0,G,C,splice_acceptor_variant,NAGNAG_SITE,RF,0.020,232218.0,0.0,105398.0,0.0,2.030000e-09
6,ACVRL1,ENSG00000139567,"telangiectasia, hereditary hemorrhagic, type 2",AD,Definitive,Hemostasis Thrombosis,Sufficient Evidence for Haploinsufficiency,12.0,51913098.0,G,A,splice_acceptor_variant,NAGNAG_SITE,RF,0.094,232218.0,1.0,105398.0,1.0,9.541000e-09
28,ACVRL1,ENSG00000139567,"telangiectasia, hereditary hemorrhagic, type 2",AD,Definitive,Hemostasis Thrombosis,Sufficient Evidence for Haploinsufficiency,12.0,51913557.0,A,T,splice_acceptor_variant,.,,0.020,,,,,2.030000e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111537,ZNF462,ENSG00000148143,weiss-kruszka syndrome,AD,Definitive,Syndromic Disorders,Sufficient Evidence for Haploinsufficiency,9.0,107009544.0,G,T,splice_acceptor_variant,.,,0.020,,,,,2.030000e-09
111547,ZNF462,ENSG00000148143,weiss-kruszka syndrome,AD,Definitive,Syndromic Disorders,Sufficient Evidence for Haploinsufficiency,9.0,107010821.0,A,T,splice_acceptor_variant,.,,0.020,,,,,2.030000e-09
111548,ZNF462,ENSG00000148143,weiss-kruszka syndrome,AD,Definitive,Syndromic Disorders,Sufficient Evidence for Haploinsufficiency,9.0,107010821.0,A,C,splice_acceptor_variant,.,,0.030,,,,,3.045000e-09
111549,ZNF462,ENSG00000148143,weiss-kruszka syndrome,AD,Definitive,Syndromic Disorders,Sufficient Evidence for Haploinsufficiency,9.0,107010822.0,G,C,splice_acceptor_variant,.,,0.051,,,,,5.176500e-09
