In [68]:
import pandas as pd
pd.set_option('display.max_columns', None)
import dask.dataframe as dd

In [69]:
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [70]:
# Root data directory; the biomart/ and lof/ input files read further down are addressed relative to it.
data_dir = "/net/home/dlee/brca1/data/"

## Load shet files (Roulette and gnomAD slices)

In [71]:
# Directory holding the per-slice shet summary tables (tab-separated).
shet_dir = "/net/home/dlee/brca1/data/lof/shet/shet/deterministic/"

# Column names are supplied explicitly; lines starting with '#' are skipped as comments.
header_col = ["Gene", "mean_shet", "lower_shet", "upper_shet"]

# Roulette slices: dask globs every slice file, .compute() materialises a single
# pandas frame, and the rows are ordered from highest to lowest mean_shet.
df_r = (
    dd.read_csv(
        shet_dir + "mean_shet_CI_gnomAD_v2_Roulette_slice_*.tsv",
        sep="\t",
        comment="#",
        names=header_col,
    )
    .compute()
    .sort_values("mean_shet", ascending=False)
)

# Carlson slices are currently disabled; uncomment to load them the same way.
# df_c = (
#     dd.read_csv(
#         shet_dir + "mean_shet_CI_gnomAD_v2_Carlson_slice_*.tsv",
#         sep="\t",
#         comment="#",
#         names=header_col,
#     )
#     .compute()
#     .sort_values("mean_shet", ascending=False)
# )

# gnomAD slices: same layout and same ordering as the Roulette table.
df_g = (
    dd.read_csv(
        shet_dir + "mean_shet_CI_gnomAD_v2_gnomAD_slice_*.tsv",
        sep="\t",
        comment="#",
        names=header_col,
    )
    .compute()
    .sort_values("mean_shet", ascending=False)
)

## load LOEUF scores

In [82]:
# Lookup table from gene symbol ("gene") to Ensembl gene ID ("Gene stable ID").
# Built as one non-mutating chain: the previous version called
# drop_duplicates(inplace=True) on a column-subset frame, which is the pattern
# that can trigger pandas' SettingWithCopyWarning. The resulting rows are the same.
gene_names_df = (
    pd.read_csv(data_dir + "biomart/ENSP_ENST_ENSG_names.csv")
    .rename(columns={"Gene name": "gene"})
    .loc[:, ["gene", "Gene stable ID"]]
    # The source table can repeat a (symbol, gene ID) pair; keep each pair once.
    .drop_duplicates()
)

In [83]:
# Constraint metrics table, restricted to rows flagged canonical, with the
# Ensembl gene ID attached under the column name "Gene" (the merge key used later).
# NOTE(review): a left merge repeats a constraint row once per matching row in
# gene_names_df, so a symbol listed with several gene IDs yields several rows,
# and a symbol with no match gets a missing "Gene" - confirm this is intended.
loeuf = (
    pd.read_csv(
        data_dir + "lof/supplementary_dataset_11_full_constraint_metrics.tsv",
        sep="\t",
    )
    .loc[lambda table: table["canonical"].eq(True)]
    .merge(gene_names_df, on="gene", how="left")
    .rename(columns={"Gene stable ID": "Gene"})
)

## merge LOEUF and shet

In [84]:
# One row per gene present in both shet tables (default inner join on "Gene").
# Every non-key column name is shared by the two inputs, so it receives the
# suffix "_R" (Roulette table) or "_G" (gnomAD table).
df_pergene = pd.merge(df_r, df_g, on="Gene", suffixes=("_R", "_G"))

# Carlson estimates are currently disabled; to re-enable, suffix their columns
# with "_C" (keeping "Gene" as the key) and merge them in as well:
# df_c.columns = [x + "_C" for x in df_c.columns]
# df_c.rename({"Gene_C": "Gene"}, axis = 1, inplace = True)
# df_pergene = df_pergene.merge(df_c, on = "Gene")

In [85]:
# Attach the LOEUF rank to every gene with shet estimates; the inner join keeps
# only genes present in both tables.
df_pergene = df_pergene.merge(loeuf[["Gene", "oe_lof_upper_rank"]], how = "inner", on = "Gene")

# Drop genes without a LOEUF rank. The explicit .copy() makes the filtered frame
# independent of its parent, so the column assignments below cannot raise
# SettingWithCopyWarning (assigning onto a boolean-filtered slice does).
df_pergene = df_pergene[df_pergene["oe_lof_upper_rank"].notna()].copy()

# Number of quantile bins used for every score.
n_deciles = 10

# pd.qcut(..., labels=False) numbers the bins 0..n-1 from the lowest to the
# highest values. The shet label is flipped so that decile 0 holds the genes
# with the highest mean_shet.
for model in ["R", "G"]:
    df_pergene["shet_decile_" + model] = (n_deciles - 1) - pd.qcut(
        df_pergene["mean_shet_" + model], n_deciles, labels = False
    )

# LOEUF deciles are not flipped: decile 0 holds the lowest oe_lof_upper_rank values.
df_pergene["loeuf_decile"] = pd.qcut(df_pergene["oe_lof_upper_rank"], n_deciles, labels = False)

## load AD AR genes

In [86]:
# Load the two AD gene lists (full 2020 list and the "HQ" list) from the CGD files.
adar_dir = "/net/home/dlee/brca1/data/clinical/AD_AR_fromDan_revised/"

# Each list gets a constant `type` label so its genes can be recognised after
# being merged into the per-gene table.
ad_hq = pd.read_csv(adar_dir + "AD/CGD_AD_HQ_ENSG.tsv", sep="\t").assign(type="AD_HQ")
ad = pd.read_csv(adar_dir + "AD/CGD_AD_2020_ENSG.tsv", sep="\t").assign(type="AD")

## AD gene enrichment by decile and ROC AUC

In [87]:
# Label the per-gene table with the AD_HQ list. The label table is reduced to
# one row per gene first: a left merge repeats a df_pergene row for every
# matching row on the right, so a gene listed twice would otherwise be counted
# twice in the set size and in the decile fractions below.
ad_hq_labels = ad_hq[["Gene", "type"]].drop_duplicates(subset = "Gene")
df_pergene_merged_output = df_pergene.merge(ad_hq_labels, how = "left", on = "Gene")

# Genes carrying the AD_HQ label; genes absent from the list have a missing `type`.
disease_set = df_pergene_merged_output[df_pergene_merged_output["type"] == "AD_HQ"]
print(len(disease_set))

# Fraction of the disease genes that falls into each decile of each score.
for i in ["shet_decile_R", "shet_decile_G",  "loeuf_decile"]:
    print(disease_set.groupby(i).size()/len(disease_set))

227
shet_decile_R
0    0.414097
1    0.189427
2    0.092511
3    0.101322
4    0.048458
5    0.061674
6    0.030837
7    0.030837
8    0.013216
9    0.017621
dtype: float64
shet_decile_G
0    0.405286
1    0.180617
2    0.110132
3    0.092511
4    0.057269
5    0.052863
6    0.039648
7    0.030837
8    0.013216
9    0.017621
dtype: float64
loeuf_decile
0    0.475771
1    0.189427
2    0.101322
3    0.074890
4    0.057269
5    0.030837
6    0.017621
7    0.022026
8    0.022026
9    0.008811
dtype: float64


In [88]:
# Binary labels over the merged per-gene table: 1 where `type` is "AD_HQ", else 0.
y_true = (df_pergene_merged_output["type"] == "AD_HQ").astype(int).to_numpy()

# roc_auc_score treats larger scores as more likely positive, so each decile
# column is negated to make decile 0 the highest-scoring bin.
for decile_col in ("shet_decile_R", "shet_decile_G", "loeuf_decile"):
    print(roc_auc_score(y_true, -df_pergene_merged_output[decile_col]))

0.7694656079696851
0.7649930744316349
0.8030097582820424


In [89]:
# Label the per-gene table with the full AD list. The label table is reduced to
# one row per gene first: a left merge repeats a df_pergene row for every
# matching row on the right, so a gene listed twice would otherwise be counted
# twice in the set size and in the decile fractions below.
ad_labels = ad[["Gene", "type"]].drop_duplicates(subset = "Gene")
df_pergene_merged_output = df_pergene.merge(ad_labels, how = "left", on = "Gene")

# Genes carrying the AD label; genes absent from the list have a missing `type`.
disease_set = df_pergene_merged_output[df_pergene_merged_output["type"] == "AD"]
print(len(disease_set))

# Fraction of the disease genes that falls into each decile of each score.
for i in ["shet_decile_R", "shet_decile_G", "loeuf_decile"]:
    print(disease_set.groupby(i).size()/len(disease_set))

1097
shet_decile_R
0    0.285324
1    0.164084
2    0.111212
3    0.100273
4    0.071103
5    0.065634
6    0.060164
7    0.044667
8    0.048314
9    0.049225
dtype: float64
shet_decile_G
0    0.286235
1    0.160438
2    0.113036
3    0.097539
4    0.074749
5    0.065634
6    0.060164
7    0.048314
8    0.047402
9    0.046490
dtype: float64
loeuf_decile
0    0.282589
1    0.196901
2    0.128532
3    0.081130
4    0.072926
5    0.064722
6    0.050137
7    0.051960
8    0.030994
9    0.040109
dtype: float64


In [90]:
# Binary labels over the merged per-gene table: 1 where `type` is "AD", else 0.
y_true = (df_pergene_merged_output["type"] == "AD").astype(int).to_numpy()

# roc_auc_score treats larger scores as more likely positive, so each decile
# column is negated to make decile 0 the highest-scoring bin.
for decile_col in ("shet_decile_R", "shet_decile_G", "loeuf_decile"):
    print(roc_auc_score(y_true, -df_pergene_merged_output[decile_col]))

0.6830326306634625
0.68303276149619
0.7064332087478685


In [50]:
# Disease genes in LOEUF decile 0 that are not in Roulette shet decile 0.
# This uses whichever `disease_set` was assigned last.
# NOTE(review): the saved output of this cell comes from an earlier run (its
# execution count is out of sequence) and shows mean_shet_C / shet_decile_C
# columns that the current code no longer creates - re-run to refresh it.
in_loeuf_decile0 = disease_set["loeuf_decile"].eq(0)
in_shet_r_decile0 = disease_set["shet_decile_R"].eq(0)
disease_set[in_loeuf_decile0 & ~in_shet_r_decile0]

Unnamed: 0,Gene,mean_shet_R,lower_shet_R,upper_shet_R,mean_shet_G,lower_shet_G,upper_shet_G,mean_shet_C,lower_shet_C,upper_shet_C,oe_lof_upper_rank,shet_decile_R,shet_decile_G,shet_decile_C,loeuf_decile,type
1360,ENSG00000145362,0.247175,0.15200,0.40000,0.266199,0.164,0.43100,0.306314,0.18900,0.49500,716.0,1,1,0,0,AD
1376,ENSG00000106991,0.244848,0.08300,0.66400,0.274293,0.093,0.74400,0.266476,0.09000,0.72500,5462.0,1,1,1,0,AD
1388,ENSG00000183454,0.242584,0.10400,0.54900,0.229057,0.097,0.52200,0.232357,0.09900,0.52900,3256.0,1,1,1,0,AD
1399,ENSG00000184640,0.240963,0.06700,0.75600,0.282009,0.078,0.88000,0.277364,0.07700,0.87000,6064.0,1,0,1,0,AD
1406,ENSG00000167548,0.239753,0.15600,0.37000,0.274877,0.178,0.42400,0.265643,0.17200,0.40900,657.0,1,1,1,0,AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6393,ENSG00000083168,0.032516,0.02600,0.04100,0.034853,0.028,0.04400,0.036479,0.03000,0.04600,172.0,4,4,4,0,AD
6435,ENSG00000006283,0.032094,0.02600,0.04100,0.035995,0.029,0.04600,0.038351,0.03000,0.05000,4184.0,4,4,4,0,AD
8419,ENSG00000153922,0.018304,0.01600,0.02200,0.020613,0.018,0.02500,0.023483,0.02000,0.02900,2299.0,6,6,6,0,AD
8777,ENSG00000128917,0.016718,0.01300,0.02300,0.019005,0.015,0.02600,0.013967,0.01100,0.01900,3280.0,6,6,7,0,AD
