In [1]:
import os
import sys
import shutil

import logging
import re
import pathlib

import datetime

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

from pyplink import PyPlink
import seaborn as sns

import statsmodels.api as sm

import matplotlib.patches as patches
import matplotlib

from basic_tools import *

"""

jupyter nbconvert 5_association.ipynb --to script
python 5_association.py albumin 2 0

"""

'\n\njupyter nbconvert 5_association.ipynb --to script\npython 5_association.py albumin 2 0\n\n'

In [2]:
plink_KCHIP_HLA_AA_SNP_1000G=PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)
plink_KCHIP_HLA_AA_SNP_1000G_fam=plink_KCHIP_HLA_AA_SNP_1000G.get_fam().astype({'fid':str,'iid':str}).rename(columns={'fid':'FID','iid':'IID'})
plink_KCHIP_HLA_AA_SNP_1000G_bim=plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

In [3]:
phenotype_info=pd.read_csv('data/phenotype_pubname.tsv',sep='\t',index_col=0)

In [18]:
phenotypes=pd.read_csv(pheno_all_file_path,sep='\t')
phenotypes=phenotypes.set_index('ID').loc[plink_KCHIP_HLA_AA_SNP_1000G_fam['IID']]

binary_continuous_traits=phenotypes.columns.difference(['age','sex','cohort','diabetes'])
print(len(binary_continuous_traits))

97


In [19]:
log = logging.getLogger('logger')
log.setLevel(logging.DEBUG)

log_file_name=datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+'.log'
log_file_path=data_out_assoc_phenotype_path+log_file_name
fileHandler = logging.FileHandler(log_file_path)
streamHandler = logging.StreamHandler()

formatter = logging.Formatter('%(message)s')
fileHandler.setFormatter(formatter)
streamHandler.setFormatter(formatter)

log.addHandler(fileHandler)
log.addHandler(streamHandler)

NameError: name 'data_out_assoc_phenotype_path' is not defined

In [21]:
log.info_head=lambda x: log.info('-'*int((100-len(x))/2)+x+'-'*int((100-len(x))/2))

# Result Table generating

In [4]:
SNP_DB=pd.read_csv('data/mart_export_SNP.txt',sep='\t')

In [5]:
SNP_DB['Variant alleles set']=SNP_DB['Variant alleles'].apply(lambda x: set(x.split('/')).difference(['-']))

In [6]:
rsid_mapper=plink_KCHIP_HLA_AA_SNP_1000G_bim.copy()
rsid_mapper=rsid_mapper[(~rsid_mapper.index.str.contains('HLA'))
                        |~(rsid_mapper.index.str.contains('AA_'))]

In [14]:
def pos_a1_a2_to_rsid(row):
    
    SNP_DB_match=SNP_DB[(SNP_DB['Chromosome/scaffold position start (bp)']<=row['pos'])&
                        (SNP_DB['Chromosome/scaffold position end (bp)']>=row['pos'])]
    if len(SNP_DB_match)==0:
        return row.name
    
    SNP_DB_match_match=SNP_DB_match[SNP_DB_match['Variant alleles set'].apply(lambda x: set([row['a1'],row['a2']]).issubset(x))]
    if len(SNP_DB_match_match)==0:
        return row.name
    elif len(SNP_DB_match_match)==1:
        return SNP_DB_match_match.iloc[0]['Variant name']

    SNP_DB_match_match_match=SNP_DB_match_match[(SNP_DB_match_match['Chromosome/scaffold position start (bp)']<=row['pos'])&
                            (SNP_DB_match_match['Chromosome/scaffold position end (bp)']>=row['pos'])]        
    if len(SNP_DB_match_match_match)==0:
        return SNP_DB_match_match.iloc[0]['Variant name']
    elif len(SNP_DB_match_match_match)==1:
        return SNP_DB_match_match_match.iloc[0]['Variant name']
    else:
        SNP_DB_match_match_match.iloc[0]['Variant name']

In [15]:
rsid_mapper['marker_name_pub']=rsid_mapper.apply(pos_a1_a2_to_rsid,axis=1)

In [54]:
def marker_name_to_marker_name_pub(marker_name):
    if marker_name[:4]=='HLA_':
        return marker_name.replace('_','-')
    elif marker_name[:3]=='AA_':
        marker_name_split=marker_name.split('_')
        return 'HLA-{} position {}'.format(marker_name_split[1],marker_name_split[2])
    else:
        return rsid_mapper.loc[marker_name]['marker_name_pub']

In [25]:
phenotype_info=pd.read_csv('data/phenotype_pubname.tsv',sep='\t',index_col=0)

In [86]:
for phenotype_name in binary_continuous_traits:
    data_out_assoc_phenotype_path=data_out_assoc_path+phenotype_name+'/'

    pheotype_name_publication=phenotype_info.loc[phenotype_name]['phenotype_name_publication'].replace(' ','_').replace('/','_')
    data_out_pub_phenotype_path=data_out_pub_path+pheotype_name_publication+'/'
    pathlib.Path(data_out_pub_phenotype_path).mkdir(parents=True, exist_ok=True)
    
    pheno=pd.read_csv(data_out_pheno_path+phenotype_name+'.phe',sep='\t',names=['FID','IID','pheno'])
    phenotype_type='binary' if len(pheno['pheno'][pheno['pheno']!=-9].value_counts())<3 else 'continuous'
    phenotype_type    
    
    for step_idx_sub in range(1,100):
        log.info_head("phenotype_name: {}, phenotype_type:{} , Step : {} ".format(phenotype_name,phenotype_type,step_idx_sub))
        #print(step_idx_sub)
        if os.path.exists(data_out_assoc_phenotype_path+'step_{:02d}.cond.stop'.format(step_idx_sub)):
            print('meets end',step_idx_sub)
            break
            
        result_merge=pd.read_csv(data_out_assoc_phenotype_path+'step_{:02d}.merge.result.tsv'.format(step_idx_sub),sep='\t',index_col=0)

        result_merge=result_merge.drop(columns=['phenotype_name'])        
        result_merge['marker_name_pub']=result_merge['marker_name'].apply(marker_name_to_marker_name_pub)
        
        result_merge[['marker_name', 'marker_name_pub','samples(case/control)', 'step', 'condition', 
                      'note', 'term', 'POS', 'A1', 'A2', 'A1_freq_case', 'A1_freq_control',
                       'multi_allele', 'nobs', 'Z', 'coef', 'std', 'chisq', 'df', 'P']].to_csv(data_out_pub_phenotype_path+'step_{:02d}.result.tsv'.format(step_idx_sub),sep='\t',index=None)

        shutil.copy(data_out_assoc_phenotype_path+'step_{:02d}.merge.manhattan.png'.format(step_idx_sub),
                    data_out_pub_phenotype_path+'step_{:02d}.manhattan.png'.format(step_idx_sub))        

meets end 2
meets end 3
meets end 3
meets end 2
meets end 2
meets end 4
meets end 3
meets end 4
meets end 2
meets end 2
meets end 4
meets end 2
meets end 2
meets end 2
meets end 2
meets end 3
meets end 4
meets end 2
meets end 2
meets end 2
meets end 3
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 5
meets end 3
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 3
meets end 2
meets end 3
meets end 5
meets end 2
meets end 7
meets end 5
meets end 3
meets end 5
meets end 3
meets end 3
meets end 3
meets end 2
meets end 2
meets end 2
meets end 2
meets end 3
meets end 2
meets end 3
meets end 2
meets end 3
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 7
meets end 2
meets end 3
meets end 2
meets end 3
meets end 2
meets end 6
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meets end 2
meet

In [4]:
result_all=pd.read_csv('data/result_concat_3_top_gene_annot.tsv',sep='\t')

In [5]:
result_all_pub=result_all.copy()

In [6]:
result_all_pub['No. signals']=result_all_pub['phenotype_name'].map(lambda x: result_all[(result_all['P_significant']==1) & (result_all['phenotype_name']==x)].shape[0])

In [8]:
result_all_pub=result_all_pub[(result_all_pub['P_significant']==1)]

In [9]:
result_all_pub['phenotype_category']=phenotype_info.loc[result_all_pub['phenotype_name']]['phenotype_category'].values
result_all_pub['phenotype_type']=phenotype_info.loc[result_all_pub['phenotype_name']]['phenotype_type'].values
result_all_pub['phenotype_name_publication']=phenotype_info.loc[result_all_pub['phenotype_name']]['phenotype_name_publication'].values

In [11]:
result_all_pub[['A1','A2']]

Unnamed: 0,A1,A2
1,C,A
3,G,GGAGCCC
7,,
8,C,T
10,C,T
12,C,T
13,A,C
17,,
18,p,a
24,A,G
