In [1]:
import os
import sys

import logging
import re
import pathlib

import datetime

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

from pyplink import PyPlink

import glob
from basic_tools import *

In [2]:
# Short aliases for the final genotype / amino-acid dataset paths.
# NOTE(review): the final_* names are not defined in this notebook; presumably
# they come from `from basic_tools import *` -- confirm.
plink_path, plink_aa_path, aa_path = (
    final_plink_path,
    final_plink_aa_path,
    final_aa_path,
)

In [3]:
# Open the PLINK fileset and fetch its sample (fam) and variant (bim) tables.
plink = PyPlink(plink_path)

# Force the sample identifiers to strings and rename them to FID / IID.
fam = (
    plink.get_fam()
    .astype({'fid': str, 'iid': str})
    .rename(columns={'fid': 'FID', 'iid': 'IID'})
)
bim = plink.get_bim()

In [5]:
# Read the tab-separated phenotype table, index it by 'ID', and select the rows
# for the genotyped samples in fam.IID order.
phenotypes = (
    pd.read_csv(pheno_all_file_path, sep='\t')
    .set_index('ID')
    .loc[fam.IID]
)
phenotypes.shape

(125673, 137)

In [6]:
# A column counts as "binary" here when its name does not contain 'x_ray' and
# it has fewer than 3 distinct values (value_counts ignores NaN by default).
is_binary_column = phenotypes.apply(
    lambda col: ('x_ray' not in col.name) & (len(col.value_counts()) < 3),
    axis=0,
)
binary_traits = phenotypes.columns[is_binary_column]
binary_traits, len(binary_traits)

(Index(['sex', 'asthma', 'chronic_bronchitis', 'allergic_disease',
        'stomach_cancer', 'liver_cancer', 'colorectal_cancer', 'breast_cancer',
        'cervical_cancer', 'lung_cancer', 'thyroid_cancer', 'prostate_cancer',
        'bladder_cancer', 'colon_polyps',
        'angina_pectoris_myocardial_infarction', 'hypertension', 'stroke',
        'transient_ischemic_attacks', 'acute_liver_disease', 'diabetes',
        'hyperlipidemia', 'thyroid_disease', 'arthritis',
        'rheumatoid_arthritis', 'osteoporosis', 'parkinson',
        'chronic_gastritis_stomach_ulcer', 'duodenal_ulcer',
        'urinary_tract_infection', 'hysterectomy', 'ovarian_extraction',
        'big_baby', 'small_baby', 'blood_in_urine', 'glucose_in_urine',
        'heart_failure', 'peripheral_vascular_diseases', 'cataract', 'glaucoma',
        'renal_failure', 'fatty_liver', 'copd', 'tuberculosis',
        'benign_prostatic_hyperplasia', 'gout', 'fracture',
        'cholelithiasis_cholecystitis', 'periodontal_d

In [7]:
# A column counts as "continuous" here when its name does not contain 'x_ray'
# and it has at least 3 distinct values (value_counts ignores NaN by default).
is_continuous_column = phenotypes.apply(
    lambda col: ('x_ray' not in col.name) & (len(col.value_counts()) >= 3),
    axis=0,
)
continuous_traits = phenotypes.columns[is_continuous_column]
continuous_traits

Index(['age', 'pwi_total', 'height', 'hip_circumference',
       'waist_circumference', 'weight', 'systolic_blood_pressure',
       'diastolic_blood_pressure', 'pulse', 'menarche_onset_age',
       'menopause_age', 'menopause_cycle', 'child_delivery_number',
       'child_present', 'child_dead', 'hematocrit', 'hemoglobin', 'platelet',
       'rbc', 'wbc', 'mcv', 'mch', 'mchc', 'total_bilirubin', 'bun',
       'creatinine', 'alt', 'ast', 'gtp', 'total_cholesterol',
       'hdl_cholesterol', 'hba1c', 'freet4', 'tsh', 'triglyceride', 'albumin',
       'hscrp', 'protein_in_urine', 'renin', 'sleep_duration', 'grip_strength',
       'ALP', 'FVC_predicted', 'FEV_predicted', 'FEV_over_FVC_predicted',
       'MVV_predicted', 'bone_density_distal_sos', 'bone_density_midshaft_sos',
       'protein_in_blood', 'glucose_in_blood', 'cohort', 'bmi'],
      dtype='object')

In [8]:
# Sorted list of every trait name from both groups; the script entry point
# below indexes into this list when given a numeric argument.
all_trait_index = binary_traits.union(continuous_traits)
binary_continuous_traits = sorted(all_trait_index)
len(binary_continuous_traits)

102

In [10]:
# Choose the phenotype to analyse.
# When sys.argv[0] contains 'ipykernel' (running inside a notebook kernel) a
# fixed default trait is used. Otherwise the first command-line argument is
# taken as either a trait name or, if it is all digits, a 0-based index into
# binary_continuous_traits.
if 'ipykernel' in sys.argv[0]:
    ipykernel=True
    phenotype_name1='protein_in_blood'
    #phenotype_name='height'
else:
    ipykernel=False
    # Fail with an explicit message instead of a bare "list index out of range".
    if len(sys.argv)<2:
        raise IndexError('missing phenotype argument: pass a trait name or an index '
                         'into binary_continuous_traits (0..{})'.format(len(binary_continuous_traits)-1))
    phenotype_name1=sys.argv[1]
    if phenotype_name1.isdigit():
        trait_index=int(phenotype_name1)
        if trait_index>=len(binary_continuous_traits):
            raise IndexError('phenotype index {} is out of range; valid indices are 0..{}'.format(
                trait_index,len(binary_continuous_traits)-1))
        phenotype_name1=binary_continuous_traits[trait_index]

In [15]:
# Directory that receives the GCTA phenotype files and HE-regression outputs.
data_out_gcta_path=data_path+'out_gcta/'
# Create it up front so the later to_csv / gcta64 --out calls do not fail on a
# fresh checkout where the directory does not exist yet.
os.makedirs(data_out_gcta_path,exist_ok=True)

In [13]:
# Load the per-phenotype association input (header-less, tab-separated
# FID / IID / value) and write the GCTA phenotype file for it.
pheno1=pd.read_csv(data_out_assoc_path+phenotype_name1+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno1'])
# Rows whose value is -9 are dropped before writing.
# The file passed to gcta64 --pheno must be plain "FID IID phenotype" columns:
# the to_csv defaults (comma separator, header row, index column) would be
# misparsed, so write tab-separated with no header and no index.
pheno1[pheno1['pheno1']!=-9].to_csv(data_out_gcta_path+phenotype_name1+'.phe',sep='\t',header=False,index=False)

In [17]:
# Assemble the gcta64 --HEreg command line for the selected phenotype: the GRM
# prefix, the phenotype file written for this trait, and the output prefix.
gcta_pheno_file = data_out_gcta_path + phenotype_name1 + '.phe'
gcta_out_prefix = data_out_gcta_path + phenotype_name1
command = 'gcta64 --HEreg --grm {} --pheno {} --out {} --thread-num 40'.format(
    final_plink_aa_grm_path,
    gcta_pheno_file,
    gcta_out_prefix,
)
command

'gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G_merged.grm --pheno data/out_gcta/protein_in_blood.phe --out data/out_gcta/protein_in_blood --thread-num 40'

In [51]:
# Run the gcta64 command string built in the previous cell.
# NOTE(review): run_subprocess is not defined in this notebook; presumably it
# comes from `from basic_tools import *` -- confirm.
# NOTE(review): the saved output of this cell shows a different command
# (data/out_assoc/ALP/...) than the one built above, so it looks like stale
# output from an earlier run; re-run to refresh it.
run_subprocess(command)

'gcta64 --HEreg --grm data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G_merged.grm --pheno data/out_assoc/ALP/phenotype.phe --out data/out_assoc/ALP/HEreg --thread-num 40'

In [48]:
#pheno0=pd.read_csv(data_out_assoc_path+phenotype_list[0]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno0'])
#pheno1=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno1']);pheno0['pheno1']=pheno1['pheno1']
#pheno2=pd.read_csv(data_out_assoc_path+phenotype_list[1]+'/'+'phenotype.phe',header=None,sep='\t',names=['FID','IID','pheno2']);pheno0['pheno2']=pheno2['pheno2']

In [50]:
#pheno0.to_csv('temp.phe',header=None,index=None,sep='\t')

# Assemble the gcta64 --HEreg-bivar command line for traits 1 and 2.
# NOTE(review): 'temp.phe' is only written by the commented-out to_csv call at
# the top of this cell, so the file must already exist for this command to be
# usable -- confirm before running.
bivar_pheno_file = 'temp.phe'
bivar_out_prefix = 'testout2'
command = 'gcta64 --HEreg-bivar 1 2 --grm {} --pheno {} --out {} --thread-num 40'.format(
    final_plink_aa_grm_path,
    bivar_pheno_file,
    bivar_out_prefix,
)
command

gcta64 --HEreg-bivar 1 2 --grm data/genotype/4_merge/grm --pheno temp2.phe --out data/out_assoc/height/HE2 --thread-num 40

# For each of the three phenotype columns, count the entries that differ from
# -9 (presumably the missing-value code -- the same value is dropped before the
# GCTA phenotype file is written earlier in this notebook).
# NOTE(review): pheno0 is only created in commented-out code above, so this
# line raises NameError on a fresh kernel unless those lines are restored.
(pheno0['pheno0']!=-9).sum(),(pheno0['pheno1']!=-9).sum(),(pheno0['pheno2']!=-9).sum()