In [1]:
import os
import pandas as pd

import json
import yaml

import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.types as t
import pyspark.sql.functions as f

import glow

In [2]:
snakefile_path = os.getcwd() + "/../Snakefile"
snakefile_path

'/data/nasif12/home_if12/hoelzlwi/Projects/ALS/workflow/notebooks/../Snakefile'

In [37]:
# del snakemake

In [43]:
try:
    snakemake
except NameError:
    from snakemk_util import load_rule_args

    snakemake = load_rule_args(
        snakefile = snakefile_path,
        rule_name = 'annotate_variants_with_vep',
        default_wildcards={
        }
    )

In [44]:
print(json.dumps(snakemake.__dict__, indent=2))

{
  "resources": {
    "_cores": 1,
    "_nodes": 1
  },
  "input": {
    "vcf": "/s/raw/als/kaggle/end-als/genomics-data/AnswerALS_subset_annovar.hg38_anno_and_geno.no_intergenic.vcf.gz",
    "gtf": "../data/common/Homo_sapiens.GRCh38.84.gtf",
    "fasta": "/s/genomes/human/hg38/GRCh38.primary_assembly.genome.fa"
  },
  "params": {
    "human_genome_version": "hg38",
    "assembly": "GRCh38",
    "vep_cache": "/opt/modules/i12g/conda-ensembl-vep/99/cachedir",
    "loftee_data_dir": "/s/raw/loftee/GRCh38",
    "cadd_dir": "/s/raw/cadd/v1.6/GRCh38",
    "gnomad_vcf": "/s/raw/gnomad/2.1.1/hg38/gnomad.genomes.r2.1.1.sites.vcf.gz",
    "vep_annotate_cmd": "/data/nasif12/home_if12/hoelzlwi/Projects/ALS/workflow/scripts/vep_99_skipheader.sh"
  },
  "output": {
    "vep": "/s/project/kaggle-als/vep_annotations/ensembl_bychr.parquet"
  },
  "wildcards": {}
}


In [21]:
MEM = os.popen("ulimit -m").read()
if MEM.startswith("unlimited"):
    print("Memory not constrained, using all available memory...")
    import psutil
    MEM = psutil.virtual_memory().available / 1024
MEM = int(MEM)

N_CPU = int(os.popen("nproc").read())

print("memory: %dk" % MEM)
print("number of cores: %d" % N_CPU)

memory: 184320000k
number of cores: 110


In [3]:
# MEM = int(MEM * 0.8)

In [22]:
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join([
    '--driver-memory %dk' % MEM,
    'pyspark-shell'
])
os.environ['PYSPARK_SUBMIT_ARGS']

MAX_FAILURES=4

spark = (
    SparkSession.builder
    .appName('desmi_inject_gnomad')
    .config("spark.jars.packages", ",".join([
        "io.projectglow:glow-spark3_2.12:1.0.0",
    ]))
    .config("spark.local.dir", os.environ.get("TMP"))
    .config("spark.master", f"local[{N_CPU},{MAX_FAILURES}]")
    .config("spark.sql.shuffle.partitions", "2001")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.driver.maxResultSize", "48G")
    .config("spark.task.maxFailures", MAX_FAILURES)
    .getOrCreate()
)
glow.register(spark)
spark

In [29]:
INPUT_VCF  = snakemake.input["vcf"]
INPUT_VCF

'/s/raw/als/kaggle/end-als/genomics-data/AnswerALS_subset_annovar.hg38_anno_and_geno.no_intergenic.vcf.gz'

In [30]:
OUTPUT_PQ = snakemake.output["vep"]
OUTPUT_PQ

'/s/project/kaggle-als/vep_annotations/ensembl_bychr.parquet'

In [46]:
FASTA=snakemake.input["fasta"]
GTF=snakemake.input["gtf"]

HUMAN_GENOME_VERSION=snakemake.params["human_genome_version"]
ASSEMBLY=snakemake.params["assembly"]

VEP_CACHE=snakemake.params["vep_cache"]
LOFTEE_DATA_DIR=snakemake.params["loftee_data_dir"]
CADD_DIR=snakemake.params["cadd_dir"]
GNOMAD_VCF=snakemake.params["gnomad_vcf"]
VEP_CMD = snakemake.params["vep_annotate_cmd"]

VEP_PLUGIN_DIR=f"{VEP_CACHE}/Plugins/99_{ASSEMBLY}"

LOFTEE_PATH=f"{VEP_PLUGIN_DIR}/loftee"
MAXENTSCAN_DATA_DIR=f"{LOFTEE_PATH}/maxEntScan"

CADD_WGS_SNV=f"{CADD_DIR}/whole_genome_SNVs.tsv.gz"
CADD_INDEL={
    "GRCh37": f"{CADD_DIR}/InDels.tsv.gz",
    "GRCh38": f"{CADD_DIR}/gnomad.genomes.r3.0.indel.tsv.gz",
}[ASSEMBLY]

vep_cmd=" ".join([
#     "/opt/modules/i12g/anaconda/3-2019.10/bin/conda run --prefix /opt/modules/i12g/conda-ensembl-vep/99",
#     "vep",
    VEP_CMD,
    "--output_file STDOUT",
    "--format vcf",
    f"--cache --offline --dir={VEP_CACHE}",
    "--force_overwrite",
    "--no_stats",
    "--tab",
    "--merged",
    f"--assembly {ASSEMBLY}",
    f"--fasta {FASTA}",
    "--species homo_sapiens",
    "--everything",
    "--allele_number",
    "--total_length",
    "--numbers",
    "--symbol",
    "--hgvs",
    "--ccds",
    "--uniprot",
    "--af",
    "--af_1kg",
    "--af_esp",
    "--af_gnomad",
    "--max_af",
    "--pubmed",
    "--canonical",
    "--biotype",
    "--sift b",
    "--polyphen b",
    "--appris",
    "--domains",
    "--protein",
    "--regulatory",
    "--tsl",
# Does not really work with hg38
#     "--plugin " + ",".join([
#         "LoF",
#         f"loftee_path:{LOFTEE_PATH}",
#         f"gerp_bigwig:{LOFTEE_DATA_DIR}/gerp_conservation_scores.homo_sapiens.{ASSEMBLY}.bw",
#         f"human_ancestor_fa:{LOFTEE_DATA_DIR}/human_ancestor.fa.gz",
#         f"conservation_file:{LOFTEE_DATA_DIR}/loftee.sql",
#     ]),
#     f"--plugin LoF,human_ancestor_fa:{LOFTEE_DATA_DIR}/human_ancestor.fa.gz,conservation_file:{LOFTEE_DATA_DIR}/loftee.sql,loftee_path:{LOFTEE_PATH}",
    "--plugin Condel",
    f"--plugin MaxEntScan,{MAXENTSCAN_DATA_DIR}",
    "--plugin Blosum62",
    "--plugin miRNA",
    f"--plugin CADD,{CADD_WGS_SNV},{CADD_INDEL}",
])
#    --gtf $GTF
#    --plugin LoFtool
#    --plugin dbscSNV
#    --custom ${GNOMAD_VCF},Gnomad_2.1.1,vcf,overlap
#    --plugin AncestralAllele

print(vep_cmd)

/data/nasif12/home_if12/hoelzlwi/Projects/ALS/workflow/scripts/vep_99_skipheader.sh --output_file STDOUT --format vcf --cache --offline --dir=/opt/modules/i12g/conda-ensembl-vep/99/cachedir --force_overwrite --no_stats --tab --merged --assembly GRCh38 --fasta /s/genomes/human/hg38/GRCh38.primary_assembly.genome.fa --species homo_sapiens --everything --allele_number --total_length --numbers --symbol --hgvs --ccds --uniprot --af --af_1kg --af_esp --af_gnomad --max_af --pubmed --canonical --biotype --sift b --polyphen b --appris --domains --protein --regulatory --tsl --plugin Condel --plugin MaxEntScan,/opt/modules/i12g/conda-ensembl-vep/99/cachedir/Plugins/99_GRCh38/loftee/maxEntScan --plugin Blosum62 --plugin miRNA --plugin CADD,/s/raw/cadd/v1.6/GRCh38/whole_genome_SNVs.tsv.gz,/s/raw/cadd/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz


In [204]:
df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)

In [205]:
df = glow.transform("split_multiallelics", df)

In [206]:
df.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = false)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- INFO_OLD_MULTIALLELIC: string (nullable = true)
 |-- genotypes: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleId: string (nullable = true)
 |    |    |-- conditionalQuality: integer (nullable = true)
 |    |    |-- MQ0: integer (nullable = true)
 |    |    |-- alleleDepths: array (nullable = true)
 |    |    |    |

In [208]:
df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))

In [240]:
df.limit(10).toPandas()

  Unsupported type in conversion to Arrow: ArrayType(StructType(List(StructField(sampleId,StringType,true),StructField(conditionalQuality,IntegerType,true),StructField(MQ0,IntegerType,true),StructField(alleleDepths,ArrayType(IntegerType,true),true),StructField(PID,StringType,true),StructField(phased,BooleanType,true),StructField(calls,ArrayType(IntegerType,true),true),StructField(PGT,StringType,true),StructField(phredLikelihoods,ArrayType(IntegerType,true),true),StructField(depth,IntegerType,true),StructField(AB,DoubleType,true))),false)
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,contigName,start,end,names,referenceAllele,alternateAlleles,qual,filters,splitFromMultiAllelic,attributes,INFO_OLD_MULTIALLELIC,genotypes
0,chr1,16377,16378,[chr1:16378:T>C],T,[C],53076.99,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [39, 19], None..."
1,chr1,16486,16487,[chr1:16487:T>C],T,[C],15370.21,[VQSRTrancheSNP99.00to99.90],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [61, 0], None,..."
2,chr1,16494,16495,[chr1:16495:G>C],G,[C],63161.88,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [67, 20], None..."
3,chr1,16533,16534,[chr1:16534:C>T],C,[T],19334.2,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 94, None, [55, 11], None..."
4,chr1,16566,16567,[chr1:16567:G>C],G,[C],270.35,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [56, 0], None,..."
5,chr1,16570,16571,[chr1:16571:G>A],G,[A],3495.12,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 92, None, [54, 3], None,..."
6,chr1,16575,16576,[chr1:16576:A>C],A,[C],52.18,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [41, 0], None,..."
7,chr1,16630,16631,[chr1:16631:T>C],T,[C],132.14,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 99, None, [34, 0], None,..."
8,chr1,16681,16682,[chr1:16682:G>A],G,[A],1147.17,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 37, None, [28, 5], None,..."
9,chr1,16687,16688,[chr1:16688:G>A],G,[A],1188.62,[VQSRTrancheSNP99.90to100.00],False,"{'BS2': '.', 'BS1': '.', 'BS4': '.', 'ExAC_non...",,"[(CGND-HDA-00187-b38, 30, None, [33, 3], None,..."


In [267]:
import json
import shlex

input_df = df.select([
    f.col('contigName'),
    f.col('start'),
    f.col('end'),
    f.col('names'),
    f.col('referenceAllele'),
    f.col('alternateAlleles'),
])

vep_transformed_df = glow.transform(
    "pipe",
    input_df,
#     cmd=json.dumps(shlex.split("cat | grep -v '^##'")),
    cmd=json.dumps(shlex.split(vep_cmd)),
    inputFormatter='vcf',
    inVcfHeader='infer',
    outputFormatter='csv',
#     outQuote="##",
    outHeader=True,
    outDelimiter="\t",
)

In [268]:
vep_transformed_df.printSchema()

root
 |-- #Uploaded_variation: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Allele: string (nullable = true)
 |-- Gene: string (nullable = true)
 |-- Feature: string (nullable = true)
 |-- Feature_type: string (nullable = true)
 |-- Consequence: string (nullable = true)
 |-- cDNA_position: string (nullable = true)
 |-- CDS_position: string (nullable = true)
 |-- Protein_position: string (nullable = true)
 |-- Amino_acids: string (nullable = true)
 |-- Codons: string (nullable = true)
 |-- Existing_variation: string (nullable = true)
 |-- ALLELE_NUM: string (nullable = true)
 |-- IMPACT: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- STRAND: string (nullable = true)
 |-- FLAGS: string (nullable = true)
 |-- VARIANT_CLASS: string (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- SYMBOL_SOURCE: string (nullable = true)
 |-- HGNC_ID: string (nullable = true)
 |-- BIOTYPE: string (nullable = true)
 |-- CANONICAL: string (nullable = 

In [269]:
parsed_df = (
    vep_transformed_df
    .withColumn("#Uploaded_variation", f.split(f.col("#Uploaded_variation"), '[:>]'))
    .select([
        f.col("#Uploaded_variation")[0].alias("chrom"),
        f.col("#Uploaded_variation")[1].alias("pos"),
        f.col("#Uploaded_variation")[2].alias("ref"),
        f.col("#Uploaded_variation")[3].alias("alt"),
        "*",
    ])
    .withColumn("ref", f.when(f.col("ref") == '-', "").otherwise(f.col("ref")))
    .withColumn("alt", f.when(f.col("alt") == '-', "").otherwise(f.col("alt")))
    .withColumn("CANONICAL", f.when(f.col("CANONICAL") == 'YES', True).otherwise(False))
    .withColumn("condel_prediction", f.regexp_replace(f.col("Condel"), "\\(.*", ""))
    .withColumn("condel_score", f.regexp_replace(f.col("Condel"), "(.*\\()|\\)", ""))
    .withColumn("sift_prediction", f.regexp_replace(f.col("SIFT"), "\\(.*", ""))
    .withColumn("sift_score", f.regexp_replace(f.col("SIFT"), "(.*\\()|\\)", ""))
    .withColumn("polyphen_prediction", f.regexp_replace(f.col("PolyPhen"), "\\(.*", ""))
    .withColumn("polyphen_score", f.regexp_replace(f.col("PolyPhen"), "(.*\\()|\\)", ""))
    .drop(
        "#Uploaded_variation", 
        "Location", 
        "Allele",
        "Condel",
        "SIFT",
        "PolyPhen",
    )
)
parsed_df.printSchema()

root
 |-- chrom: string (nullable = true)
 |-- pos: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- alt: string (nullable = true)
 |-- Gene: string (nullable = true)
 |-- Feature: string (nullable = true)
 |-- Feature_type: string (nullable = true)
 |-- Consequence: string (nullable = true)
 |-- cDNA_position: string (nullable = true)
 |-- CDS_position: string (nullable = true)
 |-- Protein_position: string (nullable = true)
 |-- Amino_acids: string (nullable = true)
 |-- Codons: string (nullable = true)
 |-- Existing_variation: string (nullable = true)
 |-- ALLELE_NUM: string (nullable = true)
 |-- IMPACT: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- STRAND: string (nullable = true)
 |-- FLAGS: string (nullable = true)
 |-- VARIANT_CLASS: string (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- SYMBOL_SOURCE: string (nullable = true)
 |-- HGNC_ID: string (nullable = true)
 |-- BIOTYPE: string (nullable = true)
 |-- CANONICAL: boolea

In [270]:
# x = parsed_df.filter(f.col("Condel") != '-').limit(10).toPandas()
# x

In [271]:
dtypes = {
    "consequence": t.ArrayType(t.StringType()),
    "Existing_variation": t.ArrayType(t.StringType()),
    "ALLELE_NUM": t.IntegerType(),
    "DISTANCE": t.IntegerType(),
    "STRAND": t.ShortType(),
    "FLAGS": t.ArrayType(t.StringType()),
    "HGNC_ID": t.IntegerType(),
#     "CANONICAL": t.BooleanType(), # needs manual check if column equals "CANONICAL"
    "TREMBL": t.ArrayType(t.StringType()),
    "REFSEQ_MATCH": t.ArrayType(t.StringType()),
    "GENE_PHENO": t.BooleanType(),
    "sift_score": t.FloatType(),
    "polyphen_score": t.FloatType(),
    "EXON": t.ArrayType(t.IntegerType()),
    "INTRON": t.ArrayType(t.IntegerType()),
    "HGVS_OFFSET": t.IntegerType(),
    "AF": t.ArrayType(t.FloatType()),
    "AFR_AF": t.ArrayType(t.FloatType()),
    "AMR_AF": t.ArrayType(t.FloatType()),
    "EAS_AF": t.ArrayType(t.FloatType()),
    "EUR_AF": t.ArrayType(t.FloatType()),
    "SAS_AF": t.ArrayType(t.FloatType()),
    "AA_AF": t.ArrayType(t.FloatType()),
    "EA_AF": t.ArrayType(t.FloatType()),
    "gnomAD_AF": t.ArrayType(t.FloatType()),
    "gnomAD_AFR_AF": t.ArrayType(t.FloatType()),
    "gnomAD_AMR_AF": t.ArrayType(t.FloatType()),
    "gnomAD_ASJ_AF": t.ArrayType(t.FloatType()),
    "gnomAD_EAS_AF": t.ArrayType(t.FloatType()),
    "gnomAD_FIN_AF": t.ArrayType(t.FloatType()),
    "gnomAD_NFE_AF": t.ArrayType(t.FloatType()),
    "gnomAD_OTH_AF": t.ArrayType(t.FloatType()),
    "gnomAD_SAS_AF": t.ArrayType(t.FloatType()),
    "MAX_AF": t.FloatType(),
    "MAX_AF_POPS": t.ArrayType(t.StringType()),
    "PUBMED": t.ArrayType(t.StringType()),
    "MOTIF_POS": t.IntegerType(),
    "MOTIF_SCORE_CHANGE": t.FloatType(),
    "Condel": t.StringType(),
    "condel_score": t.FloatType(),
    "condel_prediction": t.StringType(),
    "BLOSUM62": t.IntegerType(),
    "LoF": t.StringType(),
    "LoF_filter": t.StringType(),
    "LoF_flags": t.StringType(),
    "MaxEntScan_ref": t.FloatType(),
    "MaxEntScan_alt": t.FloatType(),
    "MaxEntScan_diff": t.FloatType(),
    "CADD_PHRED": t.FloatType(),
    "CADD_RAW": t.FloatType(),
}

In [272]:
needsMinVal = {
    "AF",
    "AFR_AF",
    "AMR_AF",
    "EAS_AF",
    "EUR_AF",
    "SAS_AF",
    "AA_AF",
    "EA_AF",
    "gnomAD_AF",
    "gnomAD_AFR_AF",
    "gnomAD_AMR_AF",
    "gnomAD_ASJ_AF",
    "gnomAD_EAS_AF",
    "gnomAD_FIN_AF",
    "gnomAD_NFE_AF",
    "gnomAD_OTH_AF",
    "gnomAD_SAS_AF",
}

In [273]:
def parse_col(name):
    col = f.col(name)
    col = f.when(col == '-', None).otherwise(col)
    if name in dtypes:
        dtype = dtypes[name]
        if isinstance(dtype, t.ArrayType):
            col = f.split(col, ",")
        col = col.cast(dtypes[name])
    if name in needsMinVal:
        col = f.array_min(col)
    
    return col.alias(name)

In [274]:
parsed_df = parsed_df.select(*[
    parse_col(x) for x in parsed_df.columns
])

In [275]:
parsed_df.printSchema()

root
 |-- chrom: string (nullable = true)
 |-- pos: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- alt: string (nullable = true)
 |-- Gene: string (nullable = true)
 |-- Feature: string (nullable = true)
 |-- Feature_type: string (nullable = true)
 |-- Consequence: string (nullable = true)
 |-- cDNA_position: string (nullable = true)
 |-- CDS_position: string (nullable = true)
 |-- Protein_position: string (nullable = true)
 |-- Amino_acids: string (nullable = true)
 |-- Codons: string (nullable = true)
 |-- Existing_variation: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ALLELE_NUM: integer (nullable = true)
 |-- IMPACT: string (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- STRAND: short (nullable = true)
 |-- FLAGS: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- VARIANT_CLASS: string (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- SYMBOL_SOURCE: string (nullable = true)
 |-- 

In [266]:
parsed_df.filter(~ f.isnull("condel_score")).limit(10).toPandas()

Unnamed: 0,chrom,pos,ref,alt,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,...,BLOSUM62,miRNA83,CADD_PHRED,CADD_RAW,condel_prediction,condel_score,sift_prediction,sift_score,polyphen_prediction,polyphen_score
0,chr1,69134,A,G,ENSG00000186092,ENST00000335137,Transcript,missense_variant,80/1054,44/918,...,-2,,16.91,1.683178,neutral,0.319,tolerated,0.07,benign,0.0
1,chr1,69134,A,G,ENSG00000186092,ENST00000641515,Transcript,missense_variant,167/2618,107/981,...,-2,,16.91,1.683178,neutral,0.322,tolerated,0.07,benign,0.045
2,chr1,69134,A,G,79501,NM_001005484.1,Transcript,missense_variant,44/918,44/918,...,-2,,16.91,1.683178,neutral,0.319,tolerated,0.07,benign,0.0
3,chr1,69428,T,G,ENSG00000186092,ENST00000335137,Transcript,missense_variant,374/1054,338/918,...,-2,,24.5,3.441204,deleterious,0.836,deleterious,0.01,probably_damaging,0.984
4,chr1,69428,T,G,ENSG00000186092,ENST00000641515,Transcript,missense_variant,461/2618,401/981,...,-2,,24.5,3.441204,deleterious,0.883,deleterious,0.02,probably_damaging,1.0
5,chr1,69428,T,G,79501,NM_001005484.1,Transcript,missense_variant,338/918,338/918,...,-2,,24.5,3.441204,deleterious,0.836,deleterious,0.01,probably_damaging,0.984
6,chr1,69511,A,G,ENSG00000186092,ENST00000335137,Transcript,missense_variant,457/1054,421/918,...,0,,2.209,0.112916,neutral,0.001,tolerated,0.82,benign,0.0
7,chr1,69511,A,G,ENSG00000186092,ENST00000641515,Transcript,missense_variant,544/2618,484/981,...,0,,2.209,0.112916,neutral,0.001,tolerated,0.85,benign,0.0
8,chr1,69511,A,G,79501,NM_001005484.1,Transcript,missense_variant,421/918,421/918,...,0,,2.209,0.112916,neutral,0.001,tolerated,0.82,benign,0.0
9,chr1,69761,A,T,ENSG00000186092,ENST00000335137,Transcript,missense_variant,707/1054,671/918,...,-3,,14.75,1.35154,neutral,0.406,deleterious,0.02,benign,0.229


In [276]:
# (
#     parsed_df
#     .filter(f.col("Gene").startswith("ENSG"))
#     .sortWithinPartitions(["chrom", "pos"])
#     .write
#     .mode("overwrite")
#     .partitionBy("Gene")
#     .parquet(OUTPUT_PATH + "/ensembl_genes.parquet")
# )

In [277]:
(
    parsed_df
    .filter(f.col("Gene").startswith("ENSG"))
    .sortWithinPartitions(["chrom", "Gene", "pos"]) # hierarchical sorting
    .write
    .mode("overwrite")
    .partitionBy("chrom")
    .parquet(OUTPUT_PQ)
)