 - Updated batch filtering to include a differential missingness filter

In [1]:
%matplotlib inline
import os
import pandas as pd
import seaborn as sns
import numpy as np
import subprocess as sp
import matplotlib.pyplot as plt
from IPython.display import display

import sys
sys.path.append('../../code/gwas_analysis')
from gwas_plotting import qqplot

In [2]:
# Sample-level metadata table (tab-separated, read straight from GCS).
phenofn = 'gs://popgen-gwas/data/metadata/Annotated_GenotypedData/preANDpost2016_Merged_Complete_Metadata_WithAge_20210818.tsv'

# Key the rows by 'Full_IID' while keeping 'Full_IID' available as a regular
# column, so later cells can address samples with .loc[<Full_IID>, ...].
metadata = pd.read_csv(phenofn, sep='\t').set_index('Full_IID', drop=False)
metadata.head()

Unnamed: 0_level_0,Full_IID,Genotype_Sex,ID_Reformat,SampleType,sex,Disease,ISTH_SandR_Key,KGH_Survivor_Key,KGH_AcuteLassa_Key,KGH_AcuteEbola_Key,...,Batch,Country,Sex_Discrepant,Sex_Discrepant_Notes,Status,Clean_HLA_ID,Epoch,Array,CollectionEpoch,Age
Full_IID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
July2014NG_SM-6C4M9,July2014NG_SM-6C4M9,1,SM-6C4M9,SuspectedLassaNG,1,Lassa,,,,,...,July2014NG,NG,False,,Passes QC,,Pre2016,Omni_25M_B,Pre2016,30.0
July2014NG_SM-4XBS9,July2014NG_SM-4XBS9,2,SM-4XBS9,Pre2016ControlNG,2,Lassa,,,,,...,July2014NG,NG,False,,Passes QC,,Pre2016,Omni_25M_B,Pre2016,26.0
July2014NG_SM-6C4MA,July2014NG_SM-6C4MA,1,SM-6C4MA,SuspectedLassaNG,1,Lassa,,,,,...,July2014NG,NG,False,,Passes QC,,Pre2016,Omni_25M_B,Pre2016,32.0
July2014NG_SM-6C4MB,July2014NG_SM-6C4MB,2,SM-6C4MB,SuspectedLassaNG,2,Lassa,,,,,...,July2014NG,NG,False,,Passes QC,,Pre2016,Omni_25M_B,Pre2016,65.0
July2014NG_SM-4XBSM,July2014NG_SM-4XBSM,2,SM-4XBSM,Pre2016ControlNG,2,Lassa,,,,,...,July2014NG,NG,False,,Passes QC,,Pre2016,Omni_25M_B,Pre2016,47.0


In [3]:
# Aggregated LASV phenotype flags per sample, keyed by the '#IID' column
# (which is also kept as a regular column).
group_fn = 'gs://popgen-gwas/data/metadata/Annotated_GenotypedData/OmniH3Merged_PrePost2016_AggregatedLASVPhenotypes_20210829.tsv'
groups = pd.read_csv(group_fn, sep='\t').set_index('#IID', drop=False)

# A sample counts as a LASV case when it is flagged in any of these columns.
# NOTE(review): the columns are combined with `|`, so they are assumed to be
# boolean -- confirm against the TSV.
case_columns = [
    'Post2016_NG_LASV_qPCR2xPosOrSeqPos',
    'Post2016_NG_LASV_SeqPos',
    'Pre2016_NG_LASV_SeqPos',
    'Pre2016_SL_LASV_AgORSeqPos',
    'Post2016_SL_LASV_AgPos',
]
is_case = groups[case_columns[0]]
for column in case_columns[1:]:
    is_case = is_case | groups[column]
ind = groups.index[is_case]

# Start with every sample marked as not-a-case (0), then set the flagged
# sample IDs to 1.
metadata['LASV_Case'] = 0
metadata.loc[ind, 'LASV_Case'] = 1

In [4]:
# PLINK fileset prefix (path without extension); the next cell reads `a + '.bim'`.
a = '../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223'

# Tab-separated list of duplicate samples to exclude. Only the column whose
# header is the string '1' is kept (presumably the sample IID -- verify
# against the file).
dupfiltfn_fam = '../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210720.Geno_1e-1.filtBatchvars20210720.DupsToExclude.txt'
dup_table = pd.read_csv(dupfiltfn_fam, sep='\t')
dupstofilt = dup_table['1']

In [5]:
# Number of variants per value of the first .bim column (the chromosome code
# in PLINK's .bim layout); the file has no header row.
bim = pd.read_csv(a + '.bim', sep='\t', header=None)
bim[0].value_counts()

2     115716
1     106801
3      95664
6      91532
4      90673
5      87044
7      76390
8      75365
10     71934
11     67723
12     66940
9      62578
13     48887
16     45412
14     45049
15     43575
18     41243
17     37369
20     32661
23     26316
19     26119
22     18854
21     18330
24        58
Name: 0, dtype: int64

## Prepare the VCF file for imputation

In [4]:
# PLINK fileset prefix (path without extension). The VCF-preparation cell uses
# it both as the plink2 `--bfile` input and as the `--out` base from which all
# intermediate VCF/BCF file names are derived.
b = '../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223'

In [7]:
cmd = 'plink2 --bfile {b} --recode vcf-iid id-delim="|" --out {b} --keep-allele-order  --geno 0.1 --hwe 1e-12'.format(b=b)
print(cmd)
!{cmd}

#b= '{b}.maf01'.format(b=b)

## Convert 23-->X 24-->Y
vcf_fn = b+'.vcf'
vcf_fixchr_fn = b+'.renamesexchr.bcf'
plink2ensembl = '../../data/tmp/misc/plink2ensembl.txt'
cmd = 'bcftools annotate -Ou --rename-chrs %s %s > %s' % (plink2ensembl, vcf_fn, vcf_fixchr_fn)
print(cmd)
!{cmd}

vcf_fixchr_sort_fn = b+'.renamesexchr.sorted.bcf'
cmd = 'bcftools sort -Ob %s > %s' % (vcf_fixchr_fn, vcf_fixchr_sort_fn)
print(cmd)
!{cmd}

cmd = 'bcftools index %s' % vcf_fixchr_sort_fn
print(cmd)
!{cmd}

sampfn = vcf_fixchr_sort_fn.replace('.bcf', '.SexPloidy.txt')
cmd = 'bcftools +guess-ploidy -g b37 %s > %s' % (vcf_fixchr_sort_fn, sampfn)
print(cmd)
! {cmd}

vcf_fixploidy_fn = vcf_fixchr_sort_fn.replace('.bcf', '.fixPloidy.vcf.gz')
cmd = 'bcftools +fixploidy --threads 2 %s -Oz -o %s -- -s %s' % (vcf_fixchr_sort_fn, vcf_fixploidy_fn, sampfn)
print(cmd)
!{cmd}

## Check everything is fixed
ref_fn = '../../data/tmp/misc/human_g1k_v37.fasta'
cmd = 'bcftools +fixref %s -- -f %s' % (vcf_fixploidy_fn, ref_fn)
print(cmd)
!{cmd}

plink2 --bfile ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223 --recode vcf-iid id-delim="|" --out ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223 --keep-allele-order  --geno 0.1 --hwe 1e-12
PLINK v2.00a3LM 64-bit Intel (1 Jul 2021)      www.cog-genomics.org/plink/2.0/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223.log.
Options in effect:
  --bfile ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223
  --export vcf-iid id-delim=|
  --geno 0.1
  --hwe 1e-12
  --keep-allele-order
  --out ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223

Start time: Wed Feb 23 15:34:02 2022
Note: --export 'v

In [6]:
# Index the ploidy-fixed, bgzip-compressed VCF. The upload cell below copies a
# matching `.vcf.gz.csi` file, so this index must exist before uploading.
! bcftools index ../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223.renamesexchr.sorted.fixPloidy.vcf.gz

In [None]:
outbase = '../../data/tmp/plink/MergePrePost/OmniH3Merged_PrePost2016.20210817.FiltSLOmni5.Geno_1e-1.filtBatchvars20220223'
gcsdir = 'gs://popgen-gwas/data/genotype/merge/Unimputed/'
for ext in ['bed', 'bim', 'fam', 'renamesexchr.sorted.fixPloidy.vcf.gz', 'renamesexchr.sorted.fixPloidy.vcf.gz.csi']:
    cmd = 'gsutil -m cp {a}.{ext} {gcs}'.format(a=outbase, ext=ext, gcs=gcsdir)
    print(cmd)
    !{cmd}