In [1]:
import os
import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)


# Data locations. Each default is the original hard-coded path, so behavior is
# unchanged when no environment variable is set; export the matching variable
# to run the notebook on another machine without editing this cell.
pd_data_dir = os.environ.get("LAB_PD_DATA_DIR", "/home/djl34/lab_pd/data")
aso_data_dir = os.environ.get("LAB_PD_ASO_DATA_DIR", "/home/djl34/lab_pd/aso/data")
KL_data_dir = os.environ.get("LAB_PD_KL_DATA_DIR", "/home/djl34/lab_pd/kl/data")
scratch_dir = os.environ.get("LAB_PD_SCRATCH_DIR", "/n/scratch3/users/d/djl34")

## Get the list of genes that are autosomal dominant (AD) and loss-of-function (LoF)

In [2]:
# ClinGen gene-disease summary table.
filename = aso_data_dir + "/Clingen-Gene-Disease-Summary-2023-06-24.csv"
df = pd.read_csv(filename)

# ClinGen dosage-sensitivity table.
filename = aso_data_dir + "/Clingen-Dosage-Sensitivity-2023-06-25.csv"
df_ds = pd.read_csv(filename)

# Keep only the rows whose HAPLOINSUFFICIENCY label is "Sufficient" or
# "Emerging" evidence; every other label is dropped.
haploinsufficiency_labels = [
    'Sufficient Evidence for Haploinsufficiency',
    'Emerging Evidence for Haploinsufficiency',
]
is_haploinsufficient = df_ds["HAPLOINSUFFICIENCY"].isin(haploinsufficiency_labels)
df_ds_haploinsufficient = df_ds[is_haploinsufficient]

In [3]:
# Distinct HAPLOINSUFFICIENCY labels present in the dosage-sensitivity table.
pd.unique(df_ds["HAPLOINSUFFICIENCY"])

array(['Gene Associated with Autosomal Recessive Phenotype',
       'Sufficient Evidence for Haploinsufficiency',
       'No Evidence for Haploinsufficiency',
       'Little Evidence for Haploinsufficiency',
       'Emerging Evidence for Haploinsufficiency',
       'Dosage Sensitivity Unlikely'], dtype=object)

In [4]:
# Distinct CLASSIFICATION labels present in the gene-disease summary table.
pd.unique(df["CLASSIFICATION"])

array(['Disputed', 'Definitive', 'Moderate', 'Limited',
       'No Known Disease Relationship', 'Strong', 'Refuted'], dtype=object)

In [7]:
# Rows of the gene-disease table whose MOI column is "AD".
ad_mask = df["MOI"] == "AD"
df_AD = df[ad_mask]

# Of those, keep only the "Strong" and "Definitive" classifications.
strong_classifications = ["Strong", "Definitive"]
df_AD_strong = df_AD[df_AD["CLASSIFICATION"].isin(strong_classifications)]

In [8]:
# Display the AD rows classified as "Strong" or "Definitive" for a visual check.
df_AD_strong

Unnamed: 0,GENE SYMBOL,GENE ID (HGNC),DISEASE LABEL,DISEASE ID (MONDO),MOI,SOP,CLASSIFICATION,ONLINE REPORT,CLASSIFICATION DATE,GCEP
25,ACOX1,HGNC:119,Mitchell syndrome,MONDO:0030073,AD,SOP9,Strong,https://search.clinicalgenome.org/kb/gene-vali...,2022-12-29T17:00:00.000Z,Peroxisomal Disorders
31,ACTA2,HGNC:130,familial thoracic aortic aneurysm and aortic d...,MONDO:0019625,AD,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2016-09-27T00:00:00,Heritable Thoracic Aortic Aneurysm and Dissection
33,ACTB,HGNC:132,Baraitser-Winter cerebrofrontofacial syndrome,MONDO:0017579,AD,SOP8,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2021-10-26T16:58:35.977Z,Brain Malformations
34,ACTC1,HGNC:143,hypertrophic cardiomyopathy,MONDO:0005045,AD,SOP8,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2021-06-23T20:16:16.296Z,Hypertrophic Cardiomyopathy
37,ACTG1,HGNC:144,Baraitser-winter syndrome 2,MONDO:0013812,AD,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z,Hearing Loss
...,...,...,...,...,...,...,...,...,...,...
2160,YARS1,HGNC:12840,Charcot-Marie-Tooth disease,MONDO:0015626,AD,SOP7,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2020-04-28T16:00:00.000Z,Charcot-Marie-Tooth
2166,ZEB2,HGNC:14881,Mowat-Wilson syndrome,MONDO:0009341,AD,SOP5,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-05-24T16:00:00.000Z,Intellectual Disability and Autism
2167,ZMIZ1,HGNC:16493,complex neurodevelopmental disorder,MONDO:0100038,AD,SOP9,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2022-07-05T02:00:00.000Z,Intellectual Disability and Autism
2171,ZNF292,HGNC:18410,complex neurodevelopmental disorder,MONDO:0100038,AD,SOP7,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2020-05-06T16:00:00.000Z,Intellectual Disability and Autism


In [9]:
# Inner join on gene symbol: keeps only the AD Strong/Definitive rows whose
# gene also appears in the haploinsufficient subset of the dosage table.
df_AD_strong_lof = pd.merge(
    df_AD_strong,
    df_ds_haploinsufficient,
    how="inner",
    on="GENE SYMBOL",
)

In [10]:
# Unique gene symbols that survived the join, in order of first appearance.
gene_list = list(pd.unique(df_AD_strong_lof["GENE SYMBOL"]))

In [11]:
# Unique gene symbols among all AD Strong/Definitive rows (before the join).
all_ad_gene_list = list(pd.unique(df_AD_strong["GENE SYMBOL"]))

In [12]:
# AD Strong/Definitive genes that are absent from the haploinsufficient join.
set(all_ad_gene_list).difference(gene_list)

{'ACOX1',
 'ACTA2',
 'ACTB',
 'ACTC1',
 'ACTG1',
 'ACTN1',
 'AKT3',
 'ALK',
 'ALPL',
 'ANK1',
 'ANKRD26',
 'ANXA11',
 'ASXL2',
 'ATL1',
 'ATP1A2',
 'ATP1A3',
 'BRAF',
 'BRD4',
 'BRSK2',
 'BSCL2',
 'C9orf72',
 'CALM1',
 'CALM2',
 'CALM3',
 'CAPN5',
 'CARD11',
 'CAV1',
 'CAV3',
 'CBL',
 'CDC42',
 'CDK4',
 'CEBPA',
 'CHD3',
 'CHMP2B',
 'CHRNA4',
 'CHRNB2',
 'CNOT3',
 'COCH',
 'COL11A2',
 'COL5A2',
 'COL6A1',
 'COL6A2',
 'COL6A3',
 'COMP',
 'CSNK2B',
 'CTLA4',
 'CXCR4',
 'DCC',
 'DEPDC5',
 'DES',
 'DIAPH1',
 'DNAJB11',
 'DNAJB6',
 'DNM1',
 'DNM1L',
 'DNM2',
 'DNMT1',
 'DSPP',
 'EEF1A2',
 'EFEMP1',
 'EGFR',
 'EIF2S3',
 'ELANE',
 'ELOVL4',
 'EPCAM',
 'EPHB4',
 'EZH2',
 'F2',
 'F5',
 'FGFR2',
 'FGFR3',
 'FLT4',
 'FOXJ1',
 'FOXP2',
 'FUS',
 'GABRA1',
 'GABRB3',
 'GABRG2',
 'GANAB',
 'GARS1',
 'GBA',
 'GCK',
 'GDF2',
 'GFI1B',
 'GJB3',
 'GJB6',
 'GLUD1',
 'GNAI1',
 'GNAO1',
 'GP1BA',
 'GREM1',
 'GRHL2',
 'GRIA2',
 'GRIK2',
 'GRIN1',
 'GRIN2D',
 'GRN',
 'GSDME',
 'GUCY2D',
 'HBB',
 'HEPACAM',
 '