jupyter nbconvert 6_1_gcta_uni.ipynb --to script

for i in {00..101};do python 6_1_gcta_uni.py $i;done

for i in {00..10};do python 4_association.py $i;done


https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6292650/

In [1]:
import os
import sys

import logging
import re
import pathlib

import datetime

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

from pyplink import PyPlink

from basic_tools import *

# load plink, aa and check integrity

In [2]:
# Open the PLINK fileset. The path variable is not assigned in this notebook
# (presumably it arrives through `from basic_tools import *` — verify).
plink_KCHIP_HLA_AA_SNP_1000G=PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)

# Variant table, kept exactly as pyplink returns it.
plink_KCHIP_HLA_AA_SNP_1000G_bim=plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

# Sample table: force both ID columns to str, then rename them to FID/IID.
plink_KCHIP_HLA_AA_SNP_1000G_fam=(
    plink_KCHIP_HLA_AA_SNP_1000G
    .get_fam()
    .astype({'fid':str,'iid':str})
    .rename(columns={'fid':'FID','iid':'IID'})
)

In [3]:
# GRM path prefix passed to `gcta64 --grm`. The GCTA log recorded further down
# shows the sample IDs being read from this prefix with `.grm.id` appended.
grm_path='data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'

In [4]:
#final_plink_aa_grm_path

# load phenotype and check integrity

In [5]:
# Read the tab-separated table holding every phenotype, then display its
# column names joined by two spaces as a quick look at what is available.
# NOTE(review): pheno_all_file_path is not assigned in this notebook —
# presumably provided by `from basic_tools import *`; confirm.
phenotypes=pd.read_csv(pheno_all_file_path,sep='\t')
'  '.join(phenotypes.columns)

'ID  age  sex  asthma  chronic_bronchitis  allergic_disease  stomach_cancer  liver_cancer  colorectal_cancer  breast_cancer  cervical_cancer  lung_cancer  thyroid_cancer  prostate_cancer  bladder_cancer  colon_polyps  angina_pectoris_myocardial_infarction  hypertension  stroke  transient_ischemic_attacks  acute_liver_disease  diabetes  t2_diabetes  hyperlipidemia  thyroid_disease  arthritis  rheumatoid_arthritis  osteoporosis  parkinson  pwi_total  chronic_gastritis_stomach_ulcer  duodenal_ulcer  urinary_tract_infection  height  hip_circumference  waist_circumference  weight  systolic_blood_pressure  diastolic_blood_pressure  pulse  hysterectomy  menarche_onset_age  menopause_age  menopause_cycle  ovarian_extraction  child_delivery_number  big_baby  small_baby  child_present  child_dead  hematocrit  hemoglobin  platelet  rbc  wbc  mcv  mch  mchc  total_bilirubin  bun  creatinine  blood_in_urine  glucose_in_urine  alt  ast  gtp  total_cholesterol  hdl_cholesterol  hba1c  freet4  tsh  tr

In [6]:
# Index the phenotype table by sample ID and select rows in the order of the
# PLINK sample table's IID column.
# The set_index step is guarded so this cell can be re-run safely: after the
# first run 'ID' is already the index and no longer exists as a column, so an
# unconditional set_index('ID') would raise KeyError.
if 'ID' in phenotypes.columns:
    phenotypes=phenotypes.set_index('ID')
phenotypes=phenotypes.loc[plink_KCHIP_HLA_AA_SNP_1000G_fam['IID']]
phenotypes.shape

(125673, 101)

In [7]:
#np.all(phenotypes['ALP'].isnull())

In [8]:
# Integrity check: every phenotype row label must equal the IID found at the
# same position in the PLINK sample table.
assert (phenotypes.index==plink_KCHIP_HLA_AA_SNP_1000G_fam['IID']).all()

In [9]:
# Traits to analyse: every phenotype column except the four listed here.
# The Index displayed below is in sorted order; a numeric command-line
# argument is later interpreted as a position in this Index.
binary_continuous_traits=phenotypes.columns.difference(['age','sex','cohort','diabetes'])
binary_continuous_traits,len(binary_continuous_traits)

(Index(['FEV_over_FVC_predicted', 'FEV_predicted', 'FVC_predicted',
        'MVV_predicted', 'acute_liver_disease', 'albumin', 'allergic_disease',
        'alt', 'angina_pectoris_myocardial_infarction', 'arthritis', 'ast',
        'asthma', 'benign_prostatic_hyperplasia', 'big_baby', 'bladder_cancer',
        'blood_in_urine', 'bmi', 'bone_density_distal_sos',
        'bone_density_midshaft_sos', 'breast_cancer', 'bun', 'cataract',
        'cervical_cancer', 'child_dead', 'child_delivery_number',
        'child_present', 'cholelithiasis_cholecystitis', 'chronic_bronchitis',
        'chronic_gastritis_stomach_ulcer', 'colon_polyps', 'colorectal_cancer',
        'copd', 'creatinine', 'diastolic_blood_pressure', 'duodenal_ulcer',
        'fatty_liver', 'fracture', 'freet4', 'glaucoma', 'glucose_in_urine',
        'gout', 'grip_strength', 'gtp', 'hba1c', 'hdl_cholesterol',
        'heart_failure', 'height', 'hematocrit', 'hemoglobin',
        'hip_circumference', 'hscrp', 'hyperlipidemia',

# parse parameter

In [17]:
# Choose the phenotype to analyse.
#  - Inside a Jupyter kernel (sys.argv[0] contains 'ipykernel') a fixed
#    default is used.
#  - When run as a script, the first command-line argument is either a trait
#    name or a numeric position into binary_continuous_traits.
if 'ipykernel' in sys.argv[0]:
    ipykernel=True
    phenotype_name='diabetes'
    #phenotype_name='height'
else:
    ipykernel=False
    if len(sys.argv)<2:
        # Exit with a usage hint instead of a bare IndexError on sys.argv[1].
        sys.exit('usage: python {} <phenotype name | trait index>'.format(sys.argv[0]))
    phenotype_name=sys.argv[1]

if phenotype_name.isdigit():
    # Numeric argument; zero-padded values such as '007' are accepted by int().
    trait_index=int(phenotype_name)
    if trait_index>=len(binary_continuous_traits):
        # Same exception type as plain indexing would raise, but with a
        # message that says which indices are valid.
        raise IndexError(
            'trait index {} is out of range; valid indices are 0..{}'.format(
                trait_index,len(binary_continuous_traits)-1
            )
        )
    phenotype_name=binary_continuous_traits[trait_index]

In [18]:
# Load the per-trait phenotype file: tab-separated, no header row, three
# columns named FID / IID / pheno.
pheno=pd.read_csv(
    data_out_pheno_path+phenotype_name+'.phe',
    sep='\t',
    header=None,
    names=['FID','IID','pheno'],
)
# Ignoring rows coded -9, a trait with fewer than three distinct values is
# treated as binary; anything else is treated as continuous.
phenotype_type=(
    'continuous'
    if pheno.loc[pheno['pheno']!=-9,'pheno'].nunique()>=3
    else 'binary'
)
phenotype_type

'binary'

In [19]:
# Report every trait that has no '<trait>.HEreg' result file under
# data_out_gcta_path yet, together with its position in
# binary_continuous_traits (the position can be passed back on the command
# line to re-run that trait).
for trait_index,trait in enumerate(binary_continuous_traits):
    # Check the filesystem once and reuse the answer for the printout.
    has_hereg=os.path.exists(data_out_gcta_path+trait+'.HEreg')
    if not has_hereg:
        print(trait_index)
        print(trait,has_hereg)

37
freet4 False
68
peripheral_vascular_diseases False
84
t2_diabetes False


In [20]:
# Configure the logger named 'logger' to emit DEBUG-and-above records both to
# a timestamped file under data_out_gcta_path and to the console stream.
log = logging.getLogger('logger')
log.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running
# this cell would stack additional handlers and every message would be written
# several times. Detach and close whatever a previous run left behind.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()

# One log file per run, named after the start time.
log_file_name=datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+'.log'
log_file_path=data_out_gcta_path+log_file_name
fileHandler = logging.FileHandler(log_file_path)
streamHandler = logging.StreamHandler()

# Both handlers share one format: timestamp, level, source line, message.
formatter = logging.Formatter(' %(asctime)s [%(levelname)s] %(lineno)d > %(message)s')
fileHandler.setFormatter(formatter)
streamHandler.setFormatter(formatter)

log.addHandler(fileHandler)
log.addHandler(streamHandler)

In [21]:
# Record which trait this run handles and how it was classified above.
log.info("phenotype_name: {}, phenotype_type:{}".format(phenotype_name,phenotype_type))

 2020-07-09 03:57:01,697 [INFO] 1 > phenotype_name: diabetes, phenotype_type:binary


In [22]:
# Keep only rows whose phenotype is not -9 and write them, tab-separated with
# no header and no index column, to '<trait>.phe' under data_out_gcta_path.
# This is the file later passed to gcta64 via --pheno.
pheno.loc[pheno['pheno']!=-9].to_csv(
    data_out_gcta_path+phenotype_name+'.phe',
    sep='\t',
    index=None,
    header=None,
)

In [23]:
# For binary traits, a companion '<trait>.phe.prev' file next to the phenotype
# file holds a single number; parse it as a float and echo it.
# NOTE(review): 'prev' presumably stands for prevalence — confirm.
if phenotype_type=='binary':
    prev_file_path=data_out_pheno_path+phenotype_name+'.phe'+'.prev'
    with open(prev_file_path,'r') as prev_file:
        prev=float(prev_file.read())
    print('prev',prev)

prev 0.07085054068893079


In [24]:
log.info("#########################################  Run GCTA  #########################################")
# Run GCTA with --HEreg (Haseman-Elston regression) on the filtered phenotype
# file, using the GRM at grm_path. Output files are written with the prefix
# data_out_gcta_path+phenotype_name.
command='gcta64 --HEreg --grm {} --pheno {} --out {} --thread-num 40'.format(
    grm_path,
    data_out_gcta_path+phenotype_name+'.phe',
    data_out_gcta_path+phenotype_name,
)
log.info(command)
stdout,stderr=run_subprocess(command,dry=False)
log.info(stdout)
# Emit an ERROR record only when the process actually produced stderr output;
# logging it unconditionally marks every successful run as an error.
if stderr:
    log.error(stderr)


 2020-07-09 03:57:04,018 [INFO] 1 > #########################################  Run GCTA  #########################################
 2020-07-09 03:57:04,021 [INFO] 7 > gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm --pheno data/out_gcta/diabetes.phe --out data/out_gcta/diabetes --thread-num 40


------RUN-----
gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm --pheno data/out_gcta/diabetes.phe --out data/out_gcta/diabetes --thread-num 40


 2020-07-09 04:03:10,455 [INFO] 9 > *******************************************************************
* Genome-wide Complex Trait Analysis (GCTA)
* version 1.92.2 beta Linux
* (C) 2010-2019, The University of Queensland
* Please report bugs to: Jian Yang <jian.yang@uq.edu.au>
*******************************************************************
Analysis started at 18:57:04 UTC on Wed Jul 08 2020.
Hostname: h3

Accepted options:
--HEreg
--grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm
--pheno data/out_gcta/diabetes.phe
--out data/out_gcta/diabetes
--thread-num 40

Note: the program will be running on 40 threads.

Reading IDs of the GRM from [data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm.grm.id].
125673 IDs read from [data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm.grm.id].
Reading phenotypes from [data/out_gcta/diabetes.phe].
Non-missing phenotypes of 89355 individuals are included from [data/out_gcta/diabetes.phe].
89355 individuals are in common in these files.

Performing H

------ERROR-----

------OUTPUT-----
*******************************************************************
* Genome-wide Complex Trait Analysis (GCTA)
* version 1.92.2 beta Linux
* (C) 2010-2019, The University of Queensland
* Please report bugs to: Jian Yang <jian.yang@uq.edu.au>
*******************************************************************
Analysis started at 18:57:04 UTC on Wed Jul 08 2020.
Hostname: h3

Accepted options:
--HEreg
--grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm
--pheno data/out_gcta/diabetes.phe
--out data/out_gcta/diabetes
--thread-num 40

Note: the program will be running on 40 threads.

Reading IDs of the GRM from [data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm.grm.id].
125673 IDs read from [data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm.grm.id].
Reading phenotypes from [data/out_gcta/diabetes.phe].
Non-missing phenotypes of 89355 individuals are included from [data/out_gcta/diabetes.phe].
89355 individuals are in common in these files.

Performing H

0.09964747356051704

In [48]:
#pheno0=pd.read_csv(data_out_assoc_path+phenotype_list[0]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno0'])
#pheno1=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno1']);pheno0['pheno1']=pheno1['pheno1']
#pheno2=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno2']);pheno0['pheno2']=pheno2['pheno2']

# Build (but do not execute) a gcta64 --HEreg-bivar command for phenotype
# columns 1 and 2 of 'temp.phe', writing to the prefix 'testout2', and display
# the resulting string.
# NOTE(review): final_plink_aa_grm_path is not assigned anywhere in the visible
# notebook (it only appears in a commented-out cell) — confirm where it comes
# from before relying on this cell.
# NOTE(review): the output recorded under this cell shows different --pheno and
# --out values than the ones built here, so that output looks stale.
command='gcta64 --HEreg-bivar 1 2 --grm {} --pheno {} --out {} --thread-num 40'.format(final_plink_aa_grm_path,
                                                                            'temp.phe',
                                                                             'testout2'  
                                                                            )
command

gcta64 --HEreg-bivar 1 2 --grm data/genotype/4_merge/grm --pheno temp2.phe --out data/out_assoc/height/HE2 --thread-num 40

(pheno0['pheno0']!=-9).sum(),(pheno0['pheno1']!=-9).sum(),(pheno0['pheno2']!=-9).sum()

In [29]:
#!cat data/out_gcta/ALP.phe