In [11]:
import io
import os
import pandas as pd


def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [12]:
df = read_vcf('shared_variants_between_CMT-H3896_CMT-H3978.vcf.gz 22-03-28-858')

In [13]:
df['combo'] = df['CHROM'] + '-' + df['POS'].astype(str) + '-' + df['REF'] + '-' + df['ALT']

In [14]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,CMT-H3896,combo
0,chr1,10109,rs376007522,A,T,0,LowGQX,SNVHPOL=4;MQ=26;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:17:0:59:10:47,12:8,1:39,11:0:LowGQX:18,0,370",chr1-10109-A-T
1,chr1,10177,rs201752861,A,C,6,LowGQX,SNVHPOL=3;MQ=23;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:38:0:8:4:5,3:2,3:3,0:2.1:LowGQX:40,0,105",chr1-10177-A-C
2,chr1,10250,rs199706086,A,C,21,LowGQX,SNVHPOL=4;MQ=33;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:33:0:4:6:1,3:1,0:0,3:0.7:LowGQX:56,0,30",chr1-10250-A-C
3,chr1,10257,rs111200574,A,C,21,LowGQX,SNVHPOL=4;MQ=32;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:47:1:5:6:2,3:2,0:0,3:1.4:LowGQX:56,0,45",chr1-10257-A-C
4,chr1,10616,rs376342519,CCGCCGTTGCAAAGGCGCGCCG,C,0,LowGQX;LowDepth,CIGAR=1M21D;RU=.;REFREP=1;IDREP=0;MQ=38;AF1000...,GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL,"0/1:3:1:5:0,0:0,0:0,0:LowGQX;LowDepth:0,0,0",chr1-10616-CCGCCGTTGCAAAGGCGCGCCG-C
...,...,...,...,...,...,...,...,...,...,...,...
3185824,chrUn_GL000218v1,160251,.,A,C,0,LowGQX,SNVHPOL=7;MQ=47;phyloP=0.24,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:7:0:69:7:58,11:29,2:29,9:0:LowGQX:8,0,370",chrUn_GL000218v1-160251-A-C
3185825,chrUn_GL000218v1,160571,.,C,T,0,LowGQX,SNVHPOL=4;MQ=50;phyloP=-0.933,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:20:1:53:0:45,8:24,4:21,4:0:LowGQX:22,0,370",chrUn_GL000218v1-160571-C-T
3185826,chrUn_GL000218v1,160662,.,A,G,157,PASS,SNVHPOL=2;MQ=41;phyloP=-0.954,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:190:11:44:0:25,19:14,5:11,14:-12.2:PASS:19...",chrUn_GL000218v1-160662-A-G
3185827,chrUn_GL000218v1,160912,.,G,A,0,LowGQX,SNVHPOL=4;MQ=49;phyloP=-0.362,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:12:0:54:3:47,7:17,5:30,2:0:LowGQX:14,0,370",chrUn_GL000218v1-160912-G-A


In [None]:
# filter so only "PASS" shows up in the FILTER column

In [15]:
pass_flag = (df['FILTER'] == 'PASS')

In [16]:
# filter for heterozygous variants

In [17]:
hetero = (df['CMT-H3896'].str.contains('0/1', na = False))

In [18]:
# running both the pass_flag and hetero filter

In [19]:
df.loc[pass_flag & hetero]

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,CMT-H3896,combo
10,chr1,14653,rs62635297,C,T,175,PASS,"SNVHPOL=3;MQ=30;cosmic=1|COSN19740038,1|COSN20...",GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:167:4:35:1:15,20:9,11:6,9:-22.1:PASS:210,0...",chr1-14653-C-T
19,chr1,19190,.,GC,G,69,PASS,CIGAR=1M1D;RU=C;REFREP=4;IDREP=3;MQ=21;CSQT=1|...,GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL,"0/1:107:17:15:9,6:8,6:1,0:PASS:104,0,167",chr1-19190-GC-G
22,chr1,28494,rs768288163,T,C,93,PASS,SNVHPOL=3;MQ=24;phyloP=-0.716;CSQT=1|WASH7P|NR...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:76:6:10:1:3,7:1,3:2,4:-13.5:PASS:128,0,73",chr1-28494-T-C
24,chr1,28558,rs371996741,C,T,106,PASS,SNVHPOL=5;MQ=26;cosmic=1|COSN6452112;phyloP=-1...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:73:5:11:0:3,8:1,3:2,5:-13.3:PASS:141,0,70",chr1-28558-C-T
26,chr1,28588,rs370050982,G,T,41,PASS,SNVHPOL=16;MQ=31;phyloP=0.06;CSQT=1|WASH7P|NR_...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:74:4:10:0:6,4:2,2:4,2:-8.5:PASS:76,0,113",chr1-28588-G-T
...,...,...,...,...,...,...,...,...,...,...,...
3185817,chrUn_GL000218v1,157762,.,A,G,125,PASS,SNVHPOL=6;MQ=47;phyloP=-0.453,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:158:14:44:2:27,17:12,7:15,10:-15.4:PASS:16...",chrUn_GL000218v1-157762-A-G
3185818,chrUn_GL000218v1,157891,.,A,G,138,PASS,SNVHPOL=2;MQ=50;phyloP=-2.02,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:171:14:36:2:19,17:5,11:14,6:-12.5:PASS:173...",chrUn_GL000218v1-157891-A-G
3185819,chrUn_GL000218v1,158493,.,T,C,182,PASS,SNVHPOL=2;MQ=51;phyloP=-0.508,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:155:18:31:0:13,18:6,5:7,13:-17.7:PASS:217,...",chrUn_GL000218v1-158493-T-C
3185822,chrUn_GL000218v1,159871,.,T,A,112,PASS,SNVHPOL=3;MQ=55;phyloP=-0.789,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:145:14:91:6:62,29:40,18:22,11:-15.4:PASS:1...",chrUn_GL000218v1-159871-T-A


In [10]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,CMT-H3896,combo
0,chr1,10109,rs376007522,A,T,0,LowGQX,SNVHPOL=4;MQ=26;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:17:0:59:10:47,12:8,1:39,11:0:LowGQX:18,0,370",chr1-10109-A-T
1,chr1,10177,rs201752861,A,C,6,LowGQX,SNVHPOL=3;MQ=23;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:38:0:8:4:5,3:2,3:3,0:2.1:LowGQX:40,0,105",chr1-10177-A-C
2,chr1,10250,rs199706086,A,C,21,LowGQX,SNVHPOL=4;MQ=33;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:33:0:4:6:1,3:1,0:0,3:0.7:LowGQX:56,0,30",chr1-10250-A-C
3,chr1,10257,rs111200574,A,C,21,LowGQX,SNVHPOL=4;MQ=32;CSQT=1|DDX11L1|NR_046018.2|ups...,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:47:1:5:6:2,3:2,0:0,3:1.4:LowGQX:56,0,45",chr1-10257-A-C
4,chr1,10616,rs376342519,CCGCCGTTGCAAAGGCGCGCCG,C,0,LowGQX;LowDepth,CIGAR=1M21D;RU=.;REFREP=1;IDREP=0;MQ=38;AF1000...,GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL,"0/1:3:1:5:0,0:0,0:0,0:LowGQX;LowDepth:0,0,0",chr1-10616-CCGCCGTTGCAAAGGCGCGCCG-C
...,...,...,...,...,...,...,...,...,...,...,...
3185824,chrUn_GL000218v1,160251,.,A,C,0,LowGQX,SNVHPOL=7;MQ=47;phyloP=0.24,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:7:0:69:7:58,11:29,2:29,9:0:LowGQX:8,0,370",chrUn_GL000218v1-160251-A-C
3185825,chrUn_GL000218v1,160571,.,C,T,0,LowGQX,SNVHPOL=4;MQ=50;phyloP=-0.933,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:20:1:53:0:45,8:24,4:21,4:0:LowGQX:22,0,370",chrUn_GL000218v1-160571-C-T
3185826,chrUn_GL000218v1,160662,.,A,G,157,PASS,SNVHPOL=2;MQ=41;phyloP=-0.954,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:190:11:44:0:25,19:14,5:11,14:-12.2:PASS:19...",chrUn_GL000218v1-160662-A-G
3185827,chrUn_GL000218v1,160912,.,G,A,0,LowGQX,SNVHPOL=4;MQ=49;phyloP=-0.362,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,"0/1:12:0:54:3:47,7:17,5:30,2:0:LowGQX:14,0,370",chrUn_GL000218v1-160912-G-A


In [20]:
import gnomad

In [21]:
import hail as hl

In [25]:
ds = hl.import_vcf("/shared_variants_between_CMT-H3896_CMT-H3978.vcf.gz 22-03-28-858", reference_genome="GRCh37", drop_samples=True).rows()


2022-09-25 20:04:42 Hail: WARN: '/shared_variants_between_CMT-H3896_CMT-H3978.vcf.gz' refers to no files


FatalError: HailException: arguments refer to no files

Java stack trace:
is.hail.utils.HailException: arguments refer to no files
	at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:17)
	at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:17)
	at is.hail.utils.package$.fatal(package.scala:78)
	at is.hail.io.vcf.LoadVCF$.globAllVCFs(LoadVCF.scala:1142)
	at is.hail.io.vcf.MatrixVCFReader$.apply(LoadVCF.scala:1570)
	at is.hail.io.vcf.MatrixVCFReader$.fromJValue(LoadVCF.scala:1666)
	at is.hail.expr.ir.MatrixReader$.fromJson(MatrixIR.scala:89)
	at is.hail.expr.ir.IRParser$.matrix_ir_1(Parser.scala:1726)
	at is.hail.expr.ir.IRParser$.$anonfun$matrix_ir$1(Parser.scala:1652)
	at is.hail.utils.StackSafe$More.advance(StackSafe.scala:64)
	at is.hail.utils.StackSafe$.run(StackSafe.scala:16)
	at is.hail.utils.StackSafe$StackFrame.run(StackSafe.scala:32)
	at is.hail.expr.ir.IRParser$.$anonfun$parse_matrix_ir$1(Parser.scala:1994)
	at is.hail.expr.ir.IRParser$.parse(Parser.scala:1979)
	at is.hail.expr.ir.IRParser$.parse_matrix_ir(Parser.scala:1994)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$2(SparkBackend.scala:692)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:70)
	at is.hail.utils.package$.using(package.scala:646)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:70)
	at is.hail.utils.package$.using(package.scala:646)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:17)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:59)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:310)
	at is.hail.backend.spark.SparkBackend.$anonfun$parse_matrix_ir$1(SparkBackend.scala:691)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.utils.ExecutionTimer$.logTime(ExecutionTimer.scala:59)
	at is.hail.backend.spark.SparkBackend.parse_matrix_ir(SparkBackend.scala:690)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)



Hail version: 0.2.100-2ea2615a797a
Error summary: HailException: arguments refer to no files