In [1]:
import pyranges as pr
import pandas as pd
from pyarrow.parquet import ParquetFile

In [2]:
#NEW
def get_vars_by_gene(var_mac_parquet_file, gene_id, gene_annotation, padding = 0, fillna = True):
    """Load the per-sample columns of all variants that overlap a gene.

    Variant columns are discovered from the parquet schema (column names of
    the form ``<chrom>:<pos>:...``), turned into one-base intervals and
    intersected with the (optionally padded) gene intervals. Only the matching
    columns plus ``IID`` are then read from disk.

    Parameters
    ----------
    var_mac_parquet_file : str or path-like
        Parquet file with an ``IID`` column and one column per variant. The
        first six schema columns are skipped as non-variant columns
        (presumably the PLINK ``.raw`` header FID/IID/PAT/MAT/SEX/PHENOTYPE --
        TODO confirm).
    gene_id : str
        Prefix of the ``gene_id`` values to select (matched with
        ``str.startswith``).
    gene_annotation : pyranges.PyRanges
        Gene intervals with a ``gene_id`` column. A copy is padded; the
        argument itself is not modified.
    padding : int, default 0
        Number of bases added on both sides of every gene interval. Padded
        starts are clipped at 0.
    fillna : bool, default True
        If true, missing values are replaced by the column median (rounded to
        an integer) before the cast to ``Int8``.

    Returns
    -------
    pandas.DataFrame
        ``IID`` plus one nullable ``Int8`` column per overlapping variant
        (only ``IID`` when nothing overlaps).
    """
    # Number of leading schema columns that are not variants.
    n_leading_non_variant_cols = 6

    column_names = ParquetFile(var_mac_parquet_file).schema.names
    variant_names = list(column_names[n_leading_non_variant_cols:])
    if not variant_names:
        # Nothing to intersect; splitting an empty Series would not yield the
        # chromosome/position columns used below.
        return pd.read_parquet(var_mac_parquet_file, columns = ["IID"])

    name_parts = pd.Series(variant_names).str.split(":", expand=True)
    positions = name_parts[1].astype(int)

    # NOTE(review): intervals are built as [pos, pos + 1). If the positions in
    # the column names are 1-based, the matching 0-based half-open interval
    # would be [pos - 1, pos) -- confirm against whatever produced the file.
    variants = pr.from_dict({
        "Chromosome": name_parts[0].astype(str),
        "Start": positions,
        "End": positions + 1,
        "var_name": pd.Series(variant_names),
    })

    # Pad a copy so the caller's annotation stays untouched; a start
    # coordinate must not become negative for genes close to position 0.
    padded_genes = gene_annotation.copy()
    padded_genes.Start = (padded_genes.Start - padding).clip(lower=0)
    padded_genes.End = padded_genes.End + padding

    selected_genes = padded_genes[padded_genes.gene_id.str.startswith(gene_id)]
    hits = variants.intersect(selected_genes)
    hit_names = hits.as_df()["var_name"].to_list() if hits else []
    # A variant overlapping several selected rows is reported once per
    # overlap; keep the first occurrence so every column is requested once.
    included_vars = list(dict.fromkeys(hit_names))

    mac_df = pd.read_parquet(var_mac_parquet_file, columns = ["IID"] + included_vars)
    if not included_vars:
        return mac_df

    genotypes = mac_df[included_vars]
    if fillna:
        # A median such as 1.5 cannot be cast to Int8, so round it first.
        genotypes = genotypes.fillna(genotypes.median().round())
    mac_df[included_vars] = genotypes.astype("Int8")
    return mac_df

# Code inserted into the path below to pick the clumping result to load
# (presumably a UK Biobank phenotype/field code -- TODO confirm).
phenocode = "50"
# Parquet file with the clumped GWAS variant columns for `phenocode`;
# it is passed to get_vars_by_gene as `var_mac_parquet_file`.
mac_index_vars_parquet = f"/s/project/uk_biobank/processed/clumping/{phenocode}/GWAS_variants_clumped_mac_{phenocode}.parquet.p"

In [3]:
# GENCODE v34 annotation lifted over to GRCh37 (as named in the file path).
gtf_file = "/s/genomes/Gencode/Gencode_human/release_34/GRCh37_mapping/gencode.v34lift37.annotation.gtf.gz"
# Parse the whole GTF into a PyRanges object.
genome_annotation = pr.read_gtf(gtf_file)
# Keep only the rows whose Feature is "gene"; these are the intervals that
# get_vars_by_gene intersects the variants with.
gene_annotation = genome_annotation[genome_annotation.Feature=="gene"]

In [4]:
# Load the variant columns that lie inside the gene whose gene_id starts with
# "ENSG00000157766", without any padding around the gene interval.
res = get_vars_by_gene(mac_index_vars_parquet, "ENSG00000157766", gene_annotation, padding = 0)

In [5]:
# Show the result; the notebook renders a truncated view (first/last rows and
# columns) of the frame.
res

Unnamed: 0,IID,chr15:89349539:G>A_G,chr15:89350458:G>T_G,chr15:89355333:G>C_G,chr15:89357521:A>G_A,chr15:89358887:T>A_T,chr15:89361142:A>G_A,chr15:89362861:G>T_G,chr15:89366519:C>T_C,chr15:89367307:A>G_A,...,chr15:89399167:C>T_C,chr15:89404982:A>G_A,chr15:89405052:C>A_C,chr15:89407522:A>G_A,chr15:89409408:C>A_C,chr15:89410998:G>A_G,chr15:89412156:G>A_G,chr15:89415247:C>G_C,chr15:89416426:G>A_G,chr15:89417902:A>C_A
0,-1_-1,0,2,2,2,2,2,2,2,0,...,1,2,2,1,2,2,2,2,2,2
1,-2_-2,1,2,2,2,2,2,2,2,0,...,2,2,1,1,2,2,2,2,2,2
2,-3_-3,2,2,2,2,2,2,2,2,1,...,0,2,2,2,2,2,2,1,2,2
3,-4_-4,1,2,2,1,2,2,2,2,1,...,1,2,2,2,2,2,2,2,2,2
4,-5_-5,2,2,2,2,2,1,2,2,1,...,1,2,2,2,2,2,2,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487404,6026151_6026151,2,2,2,2,2,2,2,2,1,...,1,2,2,2,2,2,2,2,2,2
487405,6026166_6026166,1,2,2,1,2,2,2,2,1,...,1,2,2,2,2,2,2,2,2,2
487406,6026179_6026179,0,2,2,2,2,2,2,2,0,...,0,2,2,2,2,2,2,2,2,2
487407,6026184_6026184,2,2,2,2,2,1,2,2,0,...,0,2,2,2,2,2,2,2,2,2
