In [1]:
#!/usr/bin/env python
import pandas as pd
import torch
import tensorqtl
from tensorqtl import pgen, cis
import time
import os
import os.path as op
import sys
import glob
import re
print(f'PyTorch {torch.__version__}')
#print(torch.__version__)
print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))
print(f'Pandas {pd.__version__}')
print(f'tensorqtl {tensorqtl.__version__}')

outdir='eqtl_out'
if not os.path.exists(outdir):
    os.makedirs(outdir)

PyTorch 2.5.1+cu124
CUDA available: True (NVIDIA GeForce RTX 4070 Ti SUPER)
Pandas 2.2.3
tensorqtl 1.0.10


In [2]:
## ---- CHANGE HERE --------------
in_dir='eqtl_inputs' ## directory path where the tensorqtl input files can be found:
                     ## dsname.gene.expr.bed.gz and dsname.covars.txt
dsname='habenula' ## dataset name, this is the prefix for input and output file names
plink='genotypes/habenula_maf05' ## plink prefix for genotype data
#plink='genotypes/mdd_maf01'
mapping_file=None ## comment the line below if genotype IDs are the same with RNAseq sample IDs.
#mapping_file='genoID2rnaID.tab' # genotype IDs will be changed to their RNAseq mappings in the 2nd column
#mapping_file='genotypes/mdd_geno2rna.tab'
## if not None, mapping_file must be the same with the one used for 02_prep_tensorQTL_RSE_input.R
col_map = None
if mapping_file is not None and op.exists(mapping_file):
    print(f"mapping genotype IDs to RNAseq sample IDs based on mapping file: {mapping_file}")
    mapping_df = pd.read_csv(mapping_file, sep="\t", header=None, names=["gt_id", "r_id"])
    col_map = dict(zip(mapping_df['gt_id'], mapping_df['r_id']))
if op.exists(plink+'.pgen'): # checks for plink2 pgen file for genotype data
    print("Plink2 pgen file found.")
else:
    raise Exception("Plink2 "+plink+".pgen not found!")

exprfiles=glob.glob(in_dir+'/'+dsname+'.*.expr.bed.gz')
features = [os.path.basename(file).split('.')[1] for file in exprfiles]
exprdict = {os.path.basename(file).split('.')[1]: file for file in exprfiles}
for feat in features:
    expres = exprdict[feat]
    covar=expres.replace("expr.bed.gz", "covars.txt")
    if not op.exists(covar):
       raise FileNotFoundError(error_message)("Covars file "+covar+" not found!")
print("Features found: ",features)
print(" Change the `features` array below to set the features to be processed:")
## CHANGE here and uncomment, if needed, in order to select the features to process:
features = ['gene']
print("Features to process: ", features)

Plink2 pgen file found.
Features found:  ['gene']
 Change the `features` array below to set the features to be processed:
Features to process:  ['gene']


In [3]:
#pr = genotypeio.PlinkReader(plink)
## use plink2 pgen format
pgr = pgen.PgenReader(plink)
genotype_df = pgr.load_genotypes()
#variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
variant_df = pgr.variant_df
print("Genotype dimensions: ", end='')
print(genotype_df.shape)
# if a mapping file is given
# Check if all genotype_df column names are in the mapping file
#all_columns_found = genotype_df.columns.isin(mapping_df["current_column"]).all()

if col_map is not None:
  # Rename columns using the dictionary
  genotype_df.rename(columns=col_map, inplace=True)
## Now genotype_df contains the updated column names
#print(genotype_df.iloc[:5, :7])  # Display the first few rows



Genotype dimensions: (5504021, 69)


In [15]:
def fixBrNums(col):
    return re.sub(r'^Br(\d\d\d)$', r'Br0\1', col)

## fix Brnums with 3 digits
genotype_df.columns = [fixBrNums(col) for col in genotype_df.columns]


##  etc. Fix chromosomes (add the "chr" prefix) if needed:
if not variant_df.chrom.iloc[0].startswith('chr'):
   variant_df.chrom = [ 'chr' + chrom for chrom in variant_df.chrom]
## select chromosomes - to make sure we have the same chromosomes in our data for each expression dataset
variant_chrom = set(variant_df.chrom)

for feat in features:
    print(f" Processing feature: {feat}")
    tag = dsname+'.'+feat
    expres = exprdict[feat]
    covar=expres.replace("expr.bed.gz", "covars.txt")
    covariates_df = pd.read_csv(covar, sep='\t', index_col=0).T
    ## fix Brnums with 3 digits
    covariates_df.index = [fixBrNums(idx) for idx in covariates_df.index]

    print("Covariates dim:", end='')
    print(covariates_df.shape)
    #print(covariates_df.iloc[:5, :5])
    phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expres)
    print("Phenotype dimensions:", end='')
    print(phenotype_df.shape)
    ## fix Brnums with 3 digits
    phenotype_df.columns = [fixBrNums(col) for col in phenotype_df.columns]


    ## use the same chromosome set
    express_chrom = set(phenotype_pos_df.chr)
    assert len(variant_chrom.intersection(express_chrom))>0
    if express_chrom - variant_chrom:
        chrom_filter = phenotype_pos_df.chr.isin(variant_chrom)
        if (len(chrom_filter)<phenotype_df.shape[0]):
          phenotype_df = phenotype_df[chrom_filter]
          phenotype_pos_df = phenotype_pos_df[chrom_filter]
    ## make sure we keep only the genotypes for the same expression samples
    cols=phenotype_df.columns.tolist()
    gcols=genotype_df.columns.tolist()
    ## which genotypes are in the expression data? print them here if any are missing and stop the process
    missing_geno = set(cols) - set(gcols)
    if missing_geno:
        print(f"Genotypes missing for {len(missing_geno)} samples: {missing_geno}")
        raise RuntimeError("Missing genotypes detected. Aborting...")
    ## show if any genotypes are not in the expression data:
    missing_expr = set(gcols) - set(cols)
    if missing_expr:
        print(f"These genotypes have no expression data given: {missing_expr}")
    geno_df=genotype_df.loc[:, cols]
    ## run tensorQTL:
    cis.map_nominal(geno_df, variant_df, phenotype_df, phenotype_pos_df, prefix = tag, covariates_df= covariates_df,
                maf_threshold=0.05, window=500000, output_dir= outdir, verbose=False)
print("All done.")

 Processing feature: gene
Covariates dim:(68, 19)
Phenotype dimensions:(22756, 68)
These genotypes have no expression data given: {'Br5572'}
cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 68 samples
  * 22756 phenotypes
  * 19 covariates
  * 5504021 variants
  * applying in-sample 0.05 MAF filter
  * cis-window: ±500,000
    ** dropping 65 phenotypes on chrs. without genotypes
  * checking phenotypes: 22691/22691
    ** dropping 140 phenotypes without variants in cis-window
  * Computing associations
    Mapping chromosome chr1
    time elapsed: 0.04 min
    * writing output
    Mapping chromosome chr2
    time elapsed: 0.09 min
    * writing output
    Mapping chromosome chr3
    time elapsed: 0.14 min
    * writing output
    Mapping chromosome chr4
    time elapsed: 0.17 min
    * writing output
    Mapping chromosome chr5
    time elapsed: 0.20 min
    * writing output
    Mapping chromosome chr6
    time elapsed: 0.24 min
    * writing output
    Mapping

  genotype_var_t = genotype_res_t.var(1)


    time elapsed: 0.72 min
    * writing output
done.
All done.


In [12]:
# Print columns of phenotype_df and index of covariates_df for debugging
print("Phenotype columns:", phenotype_df.columns.tolist())
print("Covariates index:", covariates_df.index.tolist())

Phenotype columns: ['Br6323', 'Br1016', 'Br2421', 'Br1637', 'Br1842', 'Br0983', 'Br1203', 'Br1738', 'Br1507', 'Br1682', 'Br1383', 'Br1427', 'Br1416', 'Br1425', 'Br1487', 'Br1526', 'Br2292', 'Br1676', 'Br2378', 'Br2589', 'Br6104', 'Br6070', 'Br8048', 'Br6158', 'Br5873', 'Br5234', 'Br5891', 'Br5488', 'Br5581', 'Br5319', 'Br5398', 'Br5412', 'Br5573', 'Br5446', 'Br5888', 'Br1034', 'Br1092', 'Br1204', 'Br1378', 'Br1469', 'Br1565', 'Br1735', 'Br1761', 'Br2015', 'Br2080', 'Br2476', 'Br1023', 'Br1980', 'Br2044', 'Br2052', 'Br2425', 'Br5292', 'Br5385', 'Br5459', 'Br5555', 'Br5558', 'Br5639', 'Br5702', 'Br5712', 'Br5756', 'Br5871', 'Br6197', 'Br6264', 'Br8050', 'Br8218', 'Br5212', 'Br1350', 'Br1225']
Covariates index: ['Br6323', 'Br1016', 'Br2421', 'Br1637', 'Br1842', 'Br983', 'Br1203', 'Br1738', 'Br1507', 'Br1682', 'Br1383', 'Br1427', 'Br1416', 'Br1425', 'Br1487', 'Br1526', 'Br2292', 'Br1676', 'Br2378', 'Br2589', 'Br6104', 'Br6070', 'Br8048', 'Br6158', 'Br5873', 'Br5234', 'Br5891', 'Br5488', 'B

In [14]:
phenotype_df.head()

Unnamed: 0_level_0,Br6323,Br1016,Br2421,Br1637,Br1842,Br0983,Br1203,Br1738,Br1507,Br1682,...,Br5712,Br5756,Br5871,Br6197,Br6264,Br8050,Br8218,Br5212,Br1350,Br1225
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000278267.1,-2.171784,-1.83784,-0.411353,-0.382676,-1.126355,-1.108615,-2.333769,-1.184872,-1.403475,-0.928317,...,-0.7926,-1.625827,-1.697315,-1.014606,-0.021202,-1.78294,-0.509058,-1.119866,-1.295406,-1.880302
ENSG00000227232.5,1.062472,1.551763,1.73671,2.369074,1.384064,1.690661,1.704066,1.359794,1.96383,2.500164,...,1.828595,1.076911,2.074495,1.671447,2.219662,0.986524,1.593278,2.311974,1.520791,0.378487
ENSG00000279457.3,2.471219,2.521775,2.088978,2.160759,2.30049,2.3091,1.647845,1.919035,2.72532,2.59268,...,1.860885,1.701429,2.345663,2.379377,2.561471,1.152476,2.785751,2.422692,1.988977,2.241173
ENSG00000228463.9,1.199063,2.210713,1.474934,1.703086,0.965648,1.154444,1.433372,0.778913,2.120458,1.00924,...,0.465975,-0.831128,1.577355,1.75596,1.707017,2.125105,1.651893,2.008958,1.520791,-0.063116
ENSG00000236679.2,-1.219832,-0.212262,-0.817122,-0.495877,0.069631,-0.026137,-0.27416,-0.9982,0.144013,-0.595708,...,-1.967343,-0.321649,-0.634523,-1.325979,-0.255546,-0.910134,-1.917174,-0.49881,-0.965249,-1.4847
