In [None]:
import tqdm
import datetime
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import RocCurveDisplay
from statsmodels.tools.sm_exceptions import PerfectSeparationError
from sklearn.preprocessing import StandardScaler#, MinMaxScaler
scaler = StandardScaler()
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
from scipy.stats import t

In [None]:
# Run-wide configuration.
date = '20240228'  # date tag embedded in the file name of the partial-correlation results read further down
OUTCOME = 'T1D_STRICT'  # not referenced in the cells shown here
SEED = 4  # not referenced in the cells shown here
# Covariate column names: 'PC1' ... 'PC10' followed by 'sex' and 'age'.
covariates = ['PC'+str(i) for i in range(1, 11)] + ['sex', 'age']
# Subset of endpoint codes; not referenced in the cells shown here.
eps_sig = ['E4_GRAVES_STRICT','D3_ANAEMIA_B12_DEF','E4_HYTHY_AI_STRICT',
           'K11_COELIAC','M13_SJOGREN','M13_RHEUMA','T1D_STRICT']
# Endpoint code -> human-readable disease name. The keys are also used to
# derive 0/1 case columns from '<code>_onset' and the values serve as plot labels.
eps_dict = {
    'D3_AIHA_OTHER':'Autoimmune hemolytic anemia',
    'D3_ALLERGPURPURA':'Allergic purpura',
    'D3_ANAEMIA_B12_DEF':'Vitamin B12 deficiency anaemia',
    'D3_ITP':'Idiopathic thrombocytopenic purpura',
    'CHIRBIL_PRIM':'Primary biliary cholangitis',
    'K11_COELIAC':'Coeliac disease',
    'K11_IBD':'Inflammatory bowel disease',
    'N14_IGA_NEPHROPATHY':'IgA nephropathy',
    'M13_MCTD':'Mixed connective tissue disease',
    'M13_RHEUMA':'Rheumatoid arthritis',
    'M13_SJOGREN':'Sjögren syndrome',
    'M13_SYSTSLCE':'Systemic sclerosis',
    'M13_WEGENER':'Wegener granulomatosis',
    'SLE_FG':'Systemic lupus erythematosus',
    'G6_GUILBAR':'Guillain-Barre syndrome',
    'G6_MS':'Multiple Sclerosis',
    'G6_MYASTHENIA':'Myasthenia gravis',
    'L12_ALOPECAREATA':'Alopecia areata',
    'L12_PSORIASIS':'Psoriasis',
    'L12_VITILIGO':'Vitiligo',
    'E4_ADDISON':'Adrenocortical insufficiency',
    'E4_GRAVES_STRICT':'Autoimmune hyperthyroidism',
    'E4_HYTHY_AI_STRICT':'Autoimmune hypothyroidism',
    'T1D_STRICT':'Type 1 diabetes'
}

In [None]:
# HLA allele names. These are used as column names: they are selected from
# hla_df and the resulting matrix is multiplied by per-endpoint weights to
# build the 'hla_<endpoint>' scores, so the order here must match the order
# of those weights.
# NOTE(review): 'A*68167', 'C*03327' and 'DPB1*04.02' do not follow the
# 'NN:NN' pattern of the other entries ('DPB1*04:02' is also listed) —
# confirm they match the actual column names in hla_R11.csv.
genes = ['A*01:01', 'A*01:02', 'A*02:01', 'A*02:02', 'A*02:03', 'A*02:05', 'A*02:06', 'A*02:07', 'A*02:17', 'A*03:01', 'A*11:01',
 'A*23:01', 'A*24:02', 'A*25:01', 'A*26:01', 'A*29:01', 'A*29:02', 'A*30:01', 'A*30:02', 'A*31:01', 'A*32:01', 'A*33:01',
 'A*33:03', 'A*33:05', 'A*68:01', 'A*69:01', 'A*68167', 'C*01:02', 'C*02:02', 'C*03:02', 'C*03:03', 'C*03:04', 'C*04:01',
 'C*04:06', 'C*05:01', 'C*06:02', 'C*07:01', 'C*07:02', 'C*07:04', 'C*08:02', 'C*12:02', 'C*12:03', 'C*14:02', 'C*15:02',
 'C*15:05', 'C*16:01', 'C*16:02', 'C*17:01', 'C*17:03', 'C*03327', 'B*07:01', 'B*07:02', 'B*08:01', 'B*13:01', 'B*13:02',
 'B*14:01', 'B*14:02', 'B*15:01', 'B*15:16', 'B*15:17', 'B*18:01', 'B*27:02', 'B*27:05', 'B*35:01', 'B*35:02', 'B*35:03',
 'B*35:08', 'B*37:01', 'B*38:01', 'B*39:01', 'B*39:06', 'B*39:24', 'B*40:01', 'B*40:02', 'B*41:01', 'B*41:02', 'B*44:02',
 'B*44:03', 'B*44:27', 'B*45:01', 'B*46:01', 'B*47:01', 'B*49:01', 'B*50:01', 'B*51:01', 'B*52:01', 'B*55:01', 'B*56:01',
 'B*57:01', 'B*58:01', 'DRB3*01:01', 'DRB4*01:01', 'DRB5*01:01', 'DRB3*02:02', 'DRB4*01:03', 'DRB5*01:02', 'DRB3*03:01',
 'DRB4*01:03N', 'DRB5*02:02', 'DRB1*01:01', 'DRB1*01:02', 'DRB1*01:03', 'DRB1*03:01', 'DRB1*04:01', 'DRB1*04:02', 'DRB1*04:03',
 'DRB1*04:04', 'DRB1*04:05', 'DRB1*04:07', 'DRB1*04:08', 'DRB1*07:01', 'DRB1*07:03', 'DRB1*08:01', 'DRB1*08:02', 'DRB1*08:03',
 'DRB1*09:01', 'DRB1*10:01', 'DRB1*11:01', 'DRB1*11:03', 'DRB1*11:04', 'DRB1*12:01', 'DRB1*13:01', 'DRB1*13:02', 'DRB1*13:03',
 'DRB1*13:05', 'DRB1*13:32', 'DRB1*14:01', 'DRB1*14:02', 'DRB1*14:54', 'DRB1*15:01', 'DRB1*15:02', 'DRB1*16:01', 'DQA1*03:02',
 'DQA1*01:01', 'DQA1*03:03', 'DQA1*01:02', 'DQA1*04:01', 'DQA1*01:03', 'DQA1*04:02', 'DQA1*01:04', 'DQA1*05:01', 'DQA1*01:05',
 'DQA1*05:03', 'DQA1*02:01', 'DQA1*05:05', 'DQA1*03:01', 'DQA1*06:01', 'DQB1*02:01', 'DQB1*02:02', 'DQB1*03:01', 'DQB1*03:02',
 'DQB1*03:03', 'DQB1*03:04', 'DQB1*03:05', 'DQB1*04:02', 'DQB1*05:01', 'DQB1*05:02', 'DQB1*05:03', 'DQB1*06:01', 'DQB1*06:02',
 'DQB1*06:03', 'DQB1*06:04', 'DQB1*06:09', 'DPB1*01:01', 'DPB1*02:01', 'DPB1*02:02', 'DPB1*03:01', 'DPB1*04:01', 'DPB1*04:02',
 'DPB1*04.02', 'DPB1*05:01', 'DPB1*06:01', 'DPB1*09:01', 'DPB1*10:01', 'DPB1*11:01', 'DPB1*13:01', 'DPB1*14:01', 'DPB1*15:01',
 'DPB1*16:01', 'DPB1*17:01', 'DPB1*19:01', 'DPB1*20:01', 'DPB1*23:01', 'DPB1*25:01', 'DPB1*31:01', 'DPB1*34:01', 'DPB1*105:01']

## Data preparation

In [None]:
# Load every individual from the minimum-extended phenotype file.
events = pd.read_csv(
    'finngen_R11/phenotype_1.0/data/finngen_R11_minimum_extended_1.0.txt.gz',
    sep='\t',
)
print('at beginning', len(events))

# Keep rows with no `movedabroad` value and a `regionofbirth` other than 9999,
# then reduce to the id and cohort columns under their project-wide names.
no_move_abroad = events['movedabroad'].isna()
region_ok = events['regionofbirth'].ne(9999)
events = (
    events.loc[no_move_abroad & region_ok, ['FINNGENID', 'COHORT']]
    .rename(columns={'FINNGENID': 'finngen_id', 'COHORT': 'source'})
)
print('now', len(events))

In [None]:
# Phenotypes inner-joined to the HLA table, then restricted (inner join again)
# to the individuals kept in `events`. Join keys are the shared column names.
phenos = pd.read_csv('phenos.csv')
hla_df = pd.read_csv('hla_R11.csv')
hla_df = (
    phenos.rename(columns={'FINNGENID': 'finngen_id'})
    .merge(hla_df, how='inner')
    .merge(events, how='inner')
)

In [None]:
# One 0/1 case indicator per endpoint: 1 when '<endpoint>_onset' is present, 0 when it is missing.
for i in eps_dict:
    has_onset = hla_df[i + '_onset'].notna()
    hla_df[i] = has_onset.astype(int)

# Numeric sex: 1 for 'female', 0 for anything else (including 'male').
hla_df['sex'] = hla_df['SEX'].eq('female').astype(int)

In [None]:
# Person table: keep only the id and the birth timestamp.
mop = (
    pd.read_csv('finngen_R11/phenotype_1.0/data/finngen_omop/finngen_R11_person.csv', sep='\t')
    .loc[:, ['person_source_value', 'birth_datetime']]
    .rename(columns={'person_source_value': 'finngen_id'})
)
hla_df = hla_df.merge(mop, how='left')

# Birth year = the text before the first '-' of birth_datetime, stored as float.
birth_year_text = hla_df['birth_datetime'].str.split('-').str.get(0)
hla_df['birth_yr'] = birth_year_text.astype(float)

In [None]:
# Pedigree file: headerless, tab-separated, six columns.
fam = pd.read_csv('finngen_R11/kinship_1.0/data/finngen_R11_pedigree.fam', sep='\t', header=None)
fam.columns = ['family_id', 'finngen_id', 'father_id', 'mother_id', 'sex', 'phenotype']

# Pedigree rows whose mother AND father are both present in hla_df.
mother_present = fam['mother_id'].isin(hla_df['finngen_id'])
father_present = fam['father_id'].isin(hla_df['finngen_id'])
fam_pa = fam[mother_present & father_present]

# Test set: every id appearing in those rows (child, father or mother);
# training set: everybody else in hla_df.
trio_ids = set(fam_pa['finngen_id']) | set(fam_pa['father_id']) | set(fam_pa['mother_id'])
test_df = hla_df[hla_df['finngen_id'].isin(trio_ids)]
train_df = hla_df[~hla_df['finngen_id'].isin(test_df['finngen_id'])]
print('train_df', len(train_df), 'test_df', len(test_df))

In [None]:
# Test-set members from the 'THL BIOBANK T1D' source who are not T1D cases ...
from_t1d_cohort = test_df['source'].eq('THL BIOBANK T1D')
not_a_case = test_df['T1D_STRICT'].eq(0)
t1d_contr_df = test_df[from_t1d_cohort & not_a_case]
# ... are removed to form test_df1.
test_df1 = test_df[~test_df['finngen_id'].isin(t1d_contr_df['finngen_id'])]

## Individual PGS
### HLA PGS

In [None]:
# Endpoints for which a non-HLA score ('non_<endpoint>') is loaded below.
eps_selected = ['T1D_STRICT', 'E4_HYTHY_AI_STRICT', 'M13_RHEUMA', 'SLE_FG',
                'K11_COELIAC', 'L12_PSORIASIS', 'K11_IBD', 'G6_MS']
# Explicit copy so the dtype cast below is applied to an independent frame
# rather than to a selection of hla_df.
full_df = hla_df[['finngen_id']+covariates+genes+eps_selected].copy()
full_df['sex'] = full_df['sex'].astype(int)

# Attach SCORE1_AVG from each '<endpoint>.no_regions.sscore' file as 'non_<endpoint>'.
for ep in eps_selected:
    try:
        prs = pd.read_csv('/home/ivm/Desktop/t1d/sandbox_prs_r11/'+ep+'.no_regions.sscore', sep='\t')
        prs = prs.sort_values('SCORE1_AVG')
        prs = prs[['IID', 'SCORE1_AVG']].rename(columns={'IID':'finngen_id', 'SCORE1_AVG':'non_'+ep})
        full_df = full_df.merge(prs, how='left', on='finngen_id')
    except Exception as err:
        # Best effort: keep going with the remaining endpoints, but report which
        # endpoint failed and why instead of silently swallowing the error
        # (a bare `except:` would also trap KeyboardInterrupt).
        print(ep, repr(err))

In [None]:
# Endpoints that only get an HLA score (no 'non_<endpoint>' score is loaded for them).
hla_only_eps = ['M13_MCTD', 'E4_ADDISON']

# HLA PGS = allele matrix (columns in `genes` order) times the endpoint's weight vector.
# NOTE(review): weight_df_dict is not defined in the cells shown here — it must
# already exist in the kernel (one weight vector per endpoint, in `genes` order).
allele_matrix = full_df[genes].to_numpy()  # loop-invariant, so built once
for ep in eps_selected + hla_only_eps:
    full_df['hla_'+ep] = allele_matrix @ weight_df_dict[ep]

# Drop the raw allele columns. The HLA scores of the two HLA-only endpoints are
# kept as well: they are selected from full_df again when the trio frame is
# built, and dropping them here made that selection fail with a KeyError.
full_df = full_df[['finngen_id'] + covariates + eps_selected
                  + ['non_'+ep for ep in eps_selected]
                  + ['hla_'+ep for ep in eps_selected + hla_only_eps]]

In [None]:
# Partial-correlation results; the file name carries the run's `date` tag.
pcor_path = '/home/ivm/Desktop/t1d/full_res_pcor_' + date + '.csv'
full_res_pcor = pd.read_csv(pcor_path)

In [None]:
# Correlation between the non-HLA and the HLA score, printed per endpoint.
for ep in eps_selected:
    non_hla_score = full_df['non_'+ep]
    hla_score = full_df['hla_'+ep]
    print(ep, non_hla_score.corr(hla_score))

In [None]:
# Per-endpoint (HLA weight, non-HLA weight) pairs derived from the partial
# correlations in full_res_pcor.
#   weights1: signed shares  pcor / (pcor_hla1 + pcor_non1)
#   weights2: absolute shares |pcor| / (|pcor_hla1| + |pcor_non1|), except that
#             a negative non-HLA partial correlation gives all weight to HLA.
eps_selected_weights1 = {}
eps_selected_weights2 = {}

for _, row in full_res_pcor.iterrows():
    ep = row.endpoint
    pcor_hla, pcor_non = row.pcor_hla1, row.pcor_non1

    signed_total = pcor_hla + pcor_non
    eps_selected_weights1[ep] = (round(pcor_hla/signed_total, 2), round(pcor_non/signed_total, 2))

    # The original tested the already-absolute ratio (`non2 < 0`), which can
    # never be negative, so the (1, 0) branch was unreachable. The sign check
    # is now done on the raw partial correlation.
    # NOTE(review): confirm this is the intended condition.
    if pcor_non < 0:
        eps_selected_weights2[ep] = (1, 0)
    else:
        abs_total = np.abs(pcor_hla) + np.abs(pcor_non)
        eps_selected_weights2[ep] = (round(np.abs(pcor_hla)/abs_total, 2),
                                     round(np.abs(pcor_non)/abs_total, 2))

In [None]:
# "Improved" weights: T1D_STRICT is pinned to (0.67, 0.33); every other
# endpoint has its weights1 pair re-scaled by those same two factors
# (HLA by 0.67, non-HLA by 0.33) and renormalised so the pair sums to 1.
eps_selected_weights3 = {}
for k, v in eps_selected_weights1.items():
    if k == 'T1D_STRICT':
        # Key by the variable k, not the literal string 'k': with the literal,
        # every endpoint overwrote one single 'k' entry and later lookups by
        # endpoint name raised KeyError.
        eps_selected_weights3[k] = (0.67, 0.33)
    else:
        denom = v[0]*0.67 + v[1]*0.33
        a = v[0]*0.67/denom
        # The non-HLA numerator uses 0.33, matching its term in the denominator.
        # With 0.67 (as before) the pair was merely proportional to weights1
        # and did not sum to 1.
        b = v[1]*0.33/denom
        eps_selected_weights3[k] = (round(a, 2), round(b, 2))

In [None]:
# Attach SCORE1_AVG from each '<endpoint>.sscore' file as 'prscs_<endpoint>'.
for ep in eps_selected:
    try:
        prs = pd.read_csv('/home/ivm/Desktop/t1d/sandbox_prs_r11/'+ep+'.sscore', sep='\t')
        prs = prs.sort_values('SCORE1_AVG')
        prs = prs[['IID', 'SCORE1_AVG']].rename(columns={'IID':'finngen_id', 'SCORE1_AVG':'prscs_'+ep})
        full_df = full_df.merge(prs, how='left', on='finngen_id')
    except Exception as err:
        # Best effort: keep going with the remaining endpoints, but report which
        # endpoint failed and why instead of silently swallowing the error
        # (a bare `except:` would also trap KeyboardInterrupt).
        print(ep, repr(err))

In [None]:
# Endpoints in eps_dict that are not in eps_selected; their 0/1 case columns
# are brought over from hla_df.
eps_excluded = [code for code in eps_dict if code not in eps_selected]
full_df = full_df.merge(hla_df[['finngen_id', *eps_excluded]], how='left')

In [None]:
ep1, ep2 = 'M13_MCTD', 'E4_ADDISON'

# Per-person columns carried into the trio frame: case status and the three
# scores for every selected endpoint, case status for the excluded endpoints,
# and the HLA score of the two HLA-only endpoints.
score_cols = ([prefix+ep for prefix in ['', 'non_', 'hla_', 'prscs_'] for ep in eps_selected]
              + eps_excluded + ['hla_'+ep1, 'hla_'+ep2])

# Child rows: pedigree ids (the last two pedigree columns are dropped) plus the
# child's own covariates and scores.
df = fam_pa.iloc[:, :-2].merge(full_df[['finngen_id']+covariates+score_cols],
                               how='left', on='finngen_id')

# Father ('fa_') and mother ('mo_') columns. Previously the father merge was
# done twice and no 'mo_*' column was ever created. The excluded-endpoint
# columns were also left unprefixed, so they clashed with the child's columns
# and became implicit join keys; every parental column is now prefixed and the
# join key is explicit.
for parent_key, prefix in [('father_id', 'fa_'), ('mother_id', 'mo_')]:
    parent_df = full_df[['finngen_id']+score_cols].rename(
        columns={'finngen_id': parent_key, **{col: prefix+col for col in score_cols}})
    df = df.merge(parent_df, how='left', on=parent_key)

In [None]:
# Column-name prefixes of the three whole-genome scores.
what = 'improved_'   # HLA + non-HLA combined with eps_selected_weights3
mita = 'full_'       # HLA + non-HLA combined with eps_selected_weights1
# Must end in '_' like the two prefixes above: the score columns are named
# 'prscs_<endpoint>' (with 'mo_' / 'fa_' in front for the parents). Without the
# underscore the lookups below fail with a KeyError.
shenme = 'prscs_'

for ep in eps_selected:
    # mid-parent non-HLA and HLA PGSs
    df['pa_non_'+ep] = (df['mo_non_'+ep] + df['fa_non_'+ep]) / 2
    df['pa_hla_'+ep] = (df['mo_hla_'+ep] + df['fa_hla_'+ep]) / 2
    # delta between the child's PGS and the mid-parent PGS
    df['non_delta_'+ep] = df['non_'+ep] - df['pa_non_'+ep]
    df['hla_delta_'+ep] = df['hla_'+ep] - df['pa_hla_'+ep]

    # weighted HLA + non-HLA scores: first the "improved" weights, then the
    # basic ones; each for child, mother and father, then mid-parent and delta
    for prefix, weights in ((what, eps_selected_weights3), (mita, eps_selected_weights1)):
        hla_w, non_w = weights[ep]
        for who in ('', 'mo_', 'fa_'):
            df[who+prefix+ep] = df[who+'hla_'+ep]*hla_w + df[who+'non_'+ep]*non_w
        df['pa_'+prefix+ep] = (df['mo_'+prefix+ep] + df['fa_'+prefix+ep]) / 2
        df[prefix+'delta_'+ep] = df[prefix+ep] - df['pa_'+prefix+ep]

    # PRS-CS score: mid-parent and delta only (per-person columns already exist)
    df['pa_'+shenme+ep] = (df['mo_'+shenme+ep] + df['fa_'+shenme+ep]) / 2
    df[shenme+'delta_'+ep] = df[shenme+ep] - df['pa_'+shenme+ep]

# HLA-only endpoints: mid-parent HLA PGS and delta
for ep in ['M13_MCTD', 'E4_ADDISON']:
    df['pa_hla_'+ep] = (df['mo_hla_'+ep] + df['fa_hla_'+ep]) / 2
    df['hla_delta_'+ep] = df['hla_'+ep] - df['pa_hla_'+ep]

In [None]:
# Keep only the rows that have a mid-parent non-HLA T1D score, then write the
# frame out without the family_id column.
has_midparent_score = df['pa_non_T1D_STRICT'].notna()
df = df[has_midparent_score]
family = df.drop(columns='family_id')
family.to_csv('family_20240212.csv', index=None)
# len(family) = 12563

### Mendelian sampling effect

In [None]:
# Plot settings shared by the Mendelian-sampling figures below.
colors = ['crimson', 'silver']  # [0] is used for the outcome == 1 group, [1] for outcome == 0
width = 0.12  # horizontal offset of each group's marker from the tick position
# NOTE(review): 'med-parent' is presumably a typo for 'mid-parent' — confirm before changing the axis label.
ylabel = 'Child - med-parent PGS'

In [None]:
def getStatsForPlotting1(data, region, endpoint):
    """Summarise the child-minus-mid-parent PGS deviation of one group.

    The delta column ``<region>_delta_<endpoint>`` is divided by the standard
    deviation of the mid-parent column ``pa_<region>_<endpoint>`` and its mean
    is tested against zero with a two-sided one-sample t-test.

    Rows whose scaled delta is missing are excluded from the sample size;
    pandas already skips them in ``mean``/``std``, so counting them in ``n``
    would understate the standard error and the p-value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the two columns named above.
    region : str
        Score prefix used in the column names (e.g. ``'hla'`` or ``'non'``).
    endpoint : str
        Endpoint code used in the column names.

    Returns
    -------
    list
        ``[mean, lower, upper, label, significant, pvalue]`` where
        ``lower``/``upper`` are ``mean -/+ 1.96 * standard error``, ``label``
        is the p-value formatted as ``'P = ...'`` and ``significant`` is
        ``True`` when ``pvalue <= 0.05``.
    """
    col_delta = region+'_delta_'+endpoint
    col_pa = 'pa_'+region+'_'+endpoint

    # Delta expressed in units of the mid-parent score's standard deviation.
    delta_deviation = (data[col_delta]/data[col_pa].std()).dropna()
    n = len(delta_deviation)

    mean_deviation = delta_deviation.mean()
    std_error = delta_deviation.std()/np.sqrt(n)

    t_stats = mean_deviation/std_error
    pvalue = t.sf(np.abs(t_stats), n-1)*2

    # Scientific notation for small p-values, two decimals otherwise.
    if pvalue < 0.01:
        p = f'P = {pvalue:.2e}'
    else:
        p = 'P = '+str(round(pvalue, 2))

    half_width = 1.96*std_error
    return [mean_deviation, mean_deviation-half_width, mean_deviation+half_width,
            p, bool(pvalue <= 0.05), pvalue]

In [None]:
# For every endpoint: pair T1D-affected children with their unaffected full
# siblings and plot the mean child-minus-mid-parent deviation (with a 95%
# interval) of the HLA and non-HLA scores for both groups.
for ep in eps_selected:
    # remove all children with affected parent(s) for this endpoint
    t1d_prs_2 = family[(family['mo_'+ep] == 0)&(family['fa_'+ep] == 0)]
    # author's recorded counts: num of children 2115 -> 2115,
    # of which 772 affected children, 1304 unaffected children
    t1d_prs_2 = t1d_prs_2[['finngen_id', 'father_id', 'mother_id', 'T1D_STRICT',
                           'non_delta_'+ep, 'hla_delta_'+ep, 'pa_non_'+ep, 'pa_hla_'+ep]]

    # NOTE(review): the child's case status is always T1D_STRICT, whatever `ep`
    # is, while the parents are filtered on `ep` — confirm this is intended.
    af_df = t1d_prs_2[t1d_prs_2.T1D_STRICT == 1]
    uf_df = t1d_prs_2[t1d_prs_2.T1D_STRICT == 0]
    uf_df = uf_df[uf_df.father_id.isin(af_df.father_id)&uf_df.mother_id.isin(af_df.mother_id)]

    # One group per affected child that has at least one unaffected sibling with
    # the same father and mother. The pieces are collected in a list and
    # concatenated once: DataFrame.append no longer exists in pandas 2.x, and
    # growing a frame row by row is quadratic and degrades columns to object dtype.
    cols = af_df.columns.tolist()+['outcome','group']
    matched = []
    group_n = 0
    for pos in tqdm.tqdm(range(len(af_df))):
        case = af_df.iloc[[pos]]
        father_id = case['father_id'].iloc[0]
        mother_id = case['mother_id'].iloc[0]
        siblings = uf_df[(uf_df.father_id == father_id)&(uf_df.mother_id == mother_id)]
        if len(siblings) > 0:
            group_n += 1
            # .assign returns new frames, so no filtered slice is written to
            matched.append(case.assign(outcome=1, group=group_n))
            matched.append(siblings.assign(outcome=0, group=group_n))
    # pd.concat raises on an empty list, so fall back to an empty frame
    df = pd.concat(matched, ignore_index=True) if matched else pd.DataFrame(columns=cols)
    print('affacted siblings',len(df[df.outcome == 1]))
    print('unaffacted siblings',len(df[df.outcome == 0]))

    plt.figure(figsize=(3, 4))
    plt.box(False)
    plt.grid()
    regions = ['hla','non']
    data = df
    outcome = ep

    for i, region in enumerate(regions):
        # affected children to the left of the tick, unaffected siblings to the right
        for outcome_value, offset, color in ((1, -width, colors[0]), (0, width, colors[1])):
            stats = getStatsForPlotting1(data[data.outcome == outcome_value], region, ep)
            print(region, ep, stats)
            x = i + offset
            plt.plot((x, x), (stats[1], stats[2]), color=color)   # interval
            plt.plot(x, stats[0], 's', color=color)               # mean
            plt.annotate(stats[3], (x+.05, stats[2]), size=9, color='black')

    plt.xticks(range(2), ['HLA','non-HLA'])
    plt.ylabel(ylabel, size=12)
    plt.xlabel(eps_dict[outcome], size=12)
    plt.xlim([-0.6,1.6])
    plt.axhline(y=0.0, color='black', linestyle='--', linewidth=1)

    # NOTE(review): per the matplotlib docs a bare grid() call toggles the grid,
    # so this second call presumably switches off what the first one switched
    # on — kept as in the original; confirm which rendering is wanted.
    plt.grid()
    plt.show()
