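Convert this notebook to a Python script:

jupyter nbconvert 6_1_gcta_uni.ipynb --to script

Run the script once per trait; a numeric argument is used as an index into `binary_continuous_traits`:

for i in {00..101};do python 6_1_gcta_uni.py $i;done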

for i in {00..10};do python 4_association.py $i;done


https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6292650/

In [1]:
import os
import sys

import logging
import re
import pathlib

import datetime

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

from pyplink import PyPlink

from basic_tools import *

# load plink, aa and check integrity

In [2]:
# Open the PLINK dataset; the sample (fam) and variant (bim) tables are read from this handle.
plink_KCHIP_HLA_AA_SNP_1000G=PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)

# Sample table: force both ID columns to str, then rename them to upper case
# ('FID'/'IID'), the column names used by the per-trait phenotype tables below.
plink_KCHIP_HLA_AA_SNP_1000G_fam=(
    plink_KCHIP_HLA_AA_SNP_1000G.get_fam()
    .astype({'fid':str,'iid':str})
    .rename(columns={'fid':'FID','iid':'IID'})
)

# Variant table, kept as returned by PyPlink.
plink_KCHIP_HLA_AA_SNP_1000G_bim=plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

In [3]:
# Value passed to gcta64 through `--grm` when the GCTA command is built later in this notebook.
# NOTE(review): presumably a GRM file prefix rather than a single file — confirm against how the GRM was generated.
grm_path='data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'

In [4]:
#final_plink_aa_grm_path

# load phenotype and check integrity

In [5]:
# Read the tab-separated table that holds every phenotype column.
phenotypes=pd.read_csv(pheno_all_file_path,sep='\t')
# Display all column names on one line, separated by two spaces.
'  '.join(phenotypes.columns.tolist())

'ID  age  sex  asthma  chronic_bronchitis  allergic_disease  stomach_cancer  liver_cancer  colorectal_cancer  breast_cancer  cervical_cancer  lung_cancer  thyroid_cancer  prostate_cancer  bladder_cancer  colon_polyps  angina_pectoris_myocardial_infarction  hypertension  stroke  transient_ischemic_attacks  acute_liver_disease  diabetes  hyperlipidemia  thyroid_disease  arthritis  rheumatoid_arthritis  osteoporosis  parkinson  pwi_total  chronic_gastritis_stomach_ulcer  duodenal_ulcer  urinary_tract_infection  height  hip_circumference  waist_circumference  weight  systolic_blood_pressure  diastolic_blood_pressure  pulse  hysterectomy  menarche_onset_age  menopause_age  menopause_cycle  ovarian_extraction  child_delivery_number  big_baby  small_baby  child_present  child_dead  hematocrit  hemoglobin  platelet  rbc  wbc  mcv  mch  mchc  total_bilirubin  bun  creatinine  blood_in_urine  glucose_in_urine  alt  ast  gtp  total_cholesterol  hdl_cholesterol  hba1c  freet4  tsh  triglyceride  a

In [6]:
# Re-order the phenotype rows so they follow the sample order of the PLINK fam table.
# The set_index step is guarded because `phenotypes` is overwritten in place:
# on a second run of this cell 'ID' is already the index, and an unconditional
# set_index('ID') would raise KeyError.
if 'ID' in phenotypes.columns:
    phenotypes=phenotypes.set_index('ID')
# Label-based selection: one row per fam IID, in fam order (raises if an IID is missing).
phenotypes=phenotypes.loc[plink_KCHIP_HLA_AA_SNP_1000G_fam['IID']]
phenotypes.shape

(125673, 101)

In [7]:
#np.all(phenotypes['ALP'].isnull())

In [8]:
# Integrity check: the phenotype index must match the fam IID column position by position.
assert not (phenotypes.index!=plink_KCHIP_HLA_AA_SNP_1000G_fam['IID']).any()

In [9]:
# Every phenotype column except these three is treated as a trait to analyse.
non_trait_columns=['age','sex','cohort']
binary_continuous_traits=phenotypes.columns.difference(non_trait_columns)
# Show the trait names together with how many there are.
(binary_continuous_traits,len(binary_continuous_traits))

(Index(['FEV_over_FVC_predicted', 'FEV_predicted', 'FVC_predicted',
        'MVV_predicted', 'acute_liver_disease', 'albumin', 'allergic_disease',
        'alt', 'angina_pectoris_myocardial_infarction', 'arthritis', 'ast',
        'asthma', 'benign_prostatic_hyperplasia', 'big_baby', 'bladder_cancer',
        'blood_in_urine', 'bmi', 'bone_density_distal_sos',
        'bone_density_midshaft_sos', 'breast_cancer', 'bun', 'cataract',
        'cervical_cancer', 'child_dead', 'child_delivery_number',
        'child_present', 'cholelithiasis_cholecystitis', 'chronic_bronchitis',
        'chronic_gastritis_stomach_ulcer', 'colon_polyps', 'colorectal_cancer',
        'copd', 'creatinine', 'diabetes', 'diastolic_blood_pressure',
        'duodenal_ulcer', 'fatty_liver', 'fracture', 'freet4', 'glaucoma',
        'glucose_in_blood', 'glucose_in_urine', 'gout', 'grip_strength', 'gtp',
        'hba1c', 'hdl_cholesterol', 'heart_failure', 'height', 'hematocrit',
        'hemoglobin', 'hip_circumfere

# parse parameter

In [10]:
# Choose the trait to analyse.
# Inside a notebook kernel the trait is fixed here; when run as a script it comes
# from the first command-line argument, which is either a trait name or a numeric
# index into `binary_continuous_traits`.
if 'ipykernel' in sys.argv[0]:
    ipykernel=True
    phenotype_name='diabetes'
    #phenotype_name='height'
else:
    ipykernel=False
    # Fail with a usage message instead of a bare IndexError when no argument is given.
    if len(sys.argv)<2:
        sys.exit('usage: python {} <phenotype name or index>'.format(sys.argv[0]))
    phenotype_name=sys.argv[1]
if phenotype_name.isdigit():
    phenotype_index=int(phenotype_name)
    # A shell loop over indices can easily run past the last trait; say so explicitly.
    if phenotype_index>=len(binary_continuous_traits):
        sys.exit('phenotype index {} is out of range: only {} traits are available'.format(
            phenotype_index,len(binary_continuous_traits)))
    phenotype_name=binary_continuous_traits[phenotype_index]

In [12]:
# Load this trait's phenotype file: tab-separated, no header row, three columns.
pheno_file_path=data_out_pheno_path+phenotype_name+'.phe'
pheno=pd.read_csv(pheno_file_path,sep='\t',header=None,names=['FID','IID','pheno'])

# Classify the trait from its values, ignoring rows coded -9:
# fewer than three distinct values -> 'binary', otherwise 'continuous'.
observed_values=pheno.loc[pheno['pheno']!=-9,'pheno']
if observed_values.nunique()<3:
    phenotype_type='binary'
else:
    phenotype_type='continuous'
phenotype_type

'binary'

In [32]:
# For every trait, report whether its '.HEreg' result file already exists in the GCTA output directory.
for trait_name in binary_continuous_traits:
    hereg_result_path=data_out_gcta_path+trait_name+'.HEreg'
    print(trait_name,os.path.exists(hereg_result_path))

FEV_over_FVC_predicted True
FEV_predicted True
FVC_predicted True
MVV_predicted True
acute_liver_disease True
albumin True
allergic_disease True
alt True
angina_pectoris_myocardial_infarction True
arthritis True
ast True
asthma True
benign_prostatic_hyperplasia True
big_baby True
bladder_cancer True
blood_in_urine True
bmi True
bone_density_distal_sos True
bone_density_midshaft_sos True
breast_cancer True
bun True
cataract True
cervical_cancer True
child_dead True
child_delivery_number True
child_present True
cholelithiasis_cholecystitis True
chronic_bronchitis True
chronic_gastritis_stomach_ulcer True
colon_polyps True
colorectal_cancer True
copd True
creatinine True
diabetes True
diastolic_blood_pressure True
duodenal_ulcer True
fatty_liver True
fracture True
freet4 False
glaucoma True
glucose_in_blood True
glucose_in_urine True
gout True
grip_strength True
gtp True
hba1c True
hdl_cholesterol True
heart_failure True
height True
hematocrit True
hemoglobin True
hip_circumference True
h

In [14]:
# Logger that writes every message both to a timestamped file in the GCTA
# output directory and to the console.
log = logging.getLogger('logger')
log.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise stack another pair of handlers and duplicate every message.
# Drop (and close) whatever is attached before adding fresh handlers.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()

# One log file per run, named after the current time.
log_file_name=datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+'.log'
log_file_path=data_out_gcta_path+log_file_name
fileHandler = logging.FileHandler(log_file_path)
streamHandler = logging.StreamHandler()

# Same record layout for both destinations.
formatter = logging.Formatter(' %(asctime)s [%(levelname)s] %(lineno)d > %(message)s')
fileHandler.setFormatter(formatter)
streamHandler.setFormatter(formatter)

log.addHandler(fileHandler)
log.addHandler(streamHandler)

In [15]:
# Record which trait this run handles and how it was classified.
run_description="phenotype_name: {}, phenotype_type:{}".format(phenotype_name,phenotype_type)
log.info(run_description)

 2020-03-15 01:32:27,931 [INFO] 1 > phenotype_name: diabetes, phenotype_type:binary


In [16]:
# Write the phenotype file used by GCTA: same three columns, rows coded -9 left out,
# tab-separated, with neither a header row nor an index column.
gcta_pheno_path=data_out_gcta_path+phenotype_name+'.phe'
pheno_without_missing=pheno.loc[pheno['pheno']!=-9]
pheno_without_missing.to_csv(gcta_pheno_path,sep='\t',index=None,header=None)

In [29]:
# Binary traits only: read the single number stored in the '<trait>.phe.prev' file
# next to the phenotype file and show it.
if phenotype_type=='binary':
    prev_file_path=data_out_pheno_path+phenotype_name+'.phe'+'.prev'
    with open(prev_file_path,'r') as prev_file:
        prev_text=prev_file.read()
    prev=float(prev_text)
    print('prev',prev)

prev 0.07085054068893079


In [30]:
log.info("#########################################  Run GCTA  #########################################")
# Run gcta64 with --HEreg for this trait, using the GRM at `grm_path` and the
# phenotype file written to the GCTA output directory; results go to
# data_out_gcta_path+phenotype_name.
# NOTE(review): the recorded output of an earlier run shows a `--prevalence` flag that
# this command no longer passes — confirm whether it was dropped on purpose.
command='gcta64 --HEreg --grm {} --pheno {} --out {} --thread-num 40'.format(grm_path,
                                                                             data_out_gcta_path+phenotype_name+'.phe',
                                                                             data_out_gcta_path+phenotype_name,
                                                                            )
log.info(command)
stdout,stderr=run_subprocess(command,dry=False)
log.info(stdout)
# Only emit an ERROR record when the command actually wrote something to stderr;
# logging an empty value on every run makes clean runs look like failures.
if stderr:
    log.error(stderr)


 2020-03-15 01:59:48,115 [INFO] 1 > #########################################  Run GCTA  #########################################
 2020-03-15 01:59:48,116 [INFO] 8 > gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm --pheno data/out_gcta/diabetes.phe --prevalence 0.07085054068893079 --out data/out_gcta/diabetes --thread-num 40


------RUN-----
gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm --pheno data/out_gcta/diabetes.phe --prevalence 0.07085054068893079 --out data/out_gcta/diabetes --thread-num 40


KeyboardInterrupt: 

0.09964747356051704

In [48]:
#pheno0=pd.read_csv(data_out_assoc_path+phenotype_list[0]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno0'])
#pheno1=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno1']);pheno0['pheno1']=pheno1['pheno1']
#pheno2=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno2']);pheno0['pheno2']=pheno2['pheno2']

# Build (but do not run) a gcta64 --HEreg-bivar command for phenotype columns 1 and 2.
# NOTE(review): `final_plink_aa_grm_path` is not assigned anywhere in the visible cells — verify where it comes from.
bivar_command_template='gcta64 --HEreg-bivar 1 2 --grm {} --pheno {} --out {} --thread-num 40'
command=bivar_command_template.format(final_plink_aa_grm_path,'temp.phe','testout2')
command

gcta64 --HEreg-bivar 1 2 --grm data/genotype/4_merge/grm --pheno temp2.phe --out data/out_assoc/height/HE2 --thread-num 40

# Number of rows not coded -9 in each of the three phenotype columns of `pheno0`.
tuple((pheno0[column_name]!=-9).sum() for column_name in ('pheno0','pheno1','pheno2'))

In [29]:
#!cat data/out_gcta/ALP.phe