In [1]:
#!/usr/bin/env python
import pandas as pd
import torch
import tensorqtl
from tensorqtl import genotypeio, cis
import time
import os
import os.path as op
import sys
import glob
import re
# Report the versions of the main libraries used by this run.
print(f'PyTorch {torch.__version__}')
# Querying the current device raises when no CUDA device is usable,
# so the device name is only requested when CUDA is actually available.
if torch.cuda.is_available():
    print('CUDA available: {} ({})'.format(True, torch.cuda.get_device_name(torch.cuda.current_device())))
else:
    print('CUDA available: False')
print(f'Pandas {pd.__version__}')
print(f'tensorqtl {tensorqtl.__version__}')

# Directory passed to tensorQTL as output_dir; created if it does not exist yet.
outdir='eqtl_out'
os.makedirs(outdir, exist_ok=True)

PyTorch 2.2.0+cu121
CUDA available: True (NVIDIA RTX A2000 12GB)
Pandas 2.1.4
tensorqtl 1.0.9


In [2]:
def read_id_mapping(path):
    """Read a two-column, tab-separated, header-less ID mapping file.

    Parameters
    ----------
    path : str
        Path to the mapping file; column 1 is the genotype ID, column 2 the
        RNA-seq sample ID it should be renamed to.

    Returns
    -------
    pandas.DataFrame
        Columns ``gt_id`` and ``r_id``, both read as strings.
    """
    # dtype=str keeps IDs verbatim: without it, numeric-looking IDs are parsed
    # as integers (dropping leading zeros) and would no longer compare equal
    # to string column labels when used as rename keys.
    return pd.read_csv(path, sep="\t", header=None, names=["gt_id", "r_id"], dtype=str)


mapping_file="mapping_gt2rna.tab" # if this file exists, genotype IDs will be changed to their mappings in the 2nd columns
# col_map stays None when there is no mapping file, which tells later cells to skip renaming.
col_map = None
if op.exists(mapping_file):
    print(f"mapping genotype IDs to RNAseq sample IDs based on mapping file: {mapping_file}")
    mapping_df = read_id_mapping(mapping_file)
    col_map = dict(zip(mapping_df['gt_id'], mapping_df['r_id']))

mapping genotype IDs to RNAseq sample IDs based on mapping file: mapping_gt2rna.tab


In [3]:
## Expression input files must be named as .-delimited tokens
## with gene/tx/exon/jx (feature) as the 2nd token, i.e. matching
## eqtl_input/*.<feature>.expr.bed.gz; the covariates file for each one is
## the same path with "expr.bed.gz" replaced by "covars.txt".
def covars_path_for(expr_file):
    """Return the covariates file paired with an expression BED file.

    Parameters
    ----------
    expr_file : str
        Path of an expression file ending in ``expr.bed.gz``.

    Returns
    -------
    str
        ``expr_file`` with ``expr.bed.gz`` replaced by ``covars.txt``.

    Raises
    ------
    FileNotFoundError
        If the derived covariates file does not exist.
    """
    covar = expr_file.replace("expr.bed.gz", "covars.txt")
    if not op.exists(covar):
        raise FileNotFoundError("Covars file "+covar+" not found!")
    return covar


# glob order is filesystem-dependent; sorting makes the processing order reproducible.
exprfiles=sorted(glob.glob('eqtl_input/*.*.expr.bed.gz'))
exprdict = {os.path.basename(file).split('.')[1]: file for file in exprfiles}
# Derive the feature list from the dict so a feature is listed once even if
# several files share the same 2nd token (the dict keeps only one file per feature).
feats = list(exprdict)
# Fail early, before any genotype loading, if a covariates file is missing.
for feat in feats:
    expres = exprdict[feat]
    covar = covars_path_for(expres)
print("features found: ",feats)

features found:  ['gene', 'tx']


In [4]:
# Path prefix of the PLINK fileset; the .fam file is checked explicitly below.
plink =  'genotypes/hippo_subset'
# Stop here with a clear message rather than continuing into the reader
# with a fileset that is known to be incomplete.
if not op.exists(plink+'.fam'):
    raise FileNotFoundError("Plink fam "+plink+".fam not found!")
print("Plink fam file found.")
# ---
pr = genotypeio.PlinkReader(plink)
genotype_df = pr.load_genotypes()
# Variant table indexed by SNP id, keeping only chromosome and position.
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
print("Genotype dimensions:", end='')
print(genotype_df.shape)

# If a mapping file was loaded (col_map is not None), rename the genotype
# sample columns to their mapped IDs.
if col_map is not None:
    # Columns absent from the mapping keep their original name under rename();
    # report how many so a partial mapping does not go unnoticed.
    n_unmapped = int((~genotype_df.columns.isin(list(col_map))).sum())
    if n_unmapped:
        print(f"Warning: {n_unmapped} genotype IDs are not in the mapping file and keep their original names")
    # Rename columns using the dictionary
    genotype_df.rename(columns=col_map, inplace=True)
# Now genotype_df contains the updated column names
print(genotype_df.iloc[:5, :7])  # Display the first few rows

Plink fam file found.


Mapping files: 100%|█████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.26s/it]


Genotype dimensions:(11560560, 195)
iid               Br1004  Br1017  Br1033  Br1039  Br1053  Br1056  Br1092
snp                                                                     
chr10:10905:G:A        0       0       0       0       0       0       0
chr10:10943:G:C        0       0       0       0       0       0       0
chr10:12113:AC:A       0       0       0       0       0       0       1
chr10:14538:C:T        0       0       0       0       0       0       0
chr10:15649:T:TG       1       0       0       1       0       1       1


In [5]:
## Chromosome labels: when the first variant's label lacks the "chr" prefix,
## prepend it to every label in the variant table.
if not variant_df['chrom'].iloc[0].startswith('chr'):
    variant_df['chrom'] = ['chr' + label for label in variant_df['chrom']]
## Set of chromosomes present in the genotype data, used later to make sure
## each expression dataset is restricted to the same chromosomes.
variant_chrom = set(variant_df['chrom'])


In [6]:
# Run nominal cis-QTL mapping once per expression feature found earlier.
for feat in feats:
    print(f" Processing feature: {feat}")
    # Prefix passed to tensorQTL for this feature's results.
    tag = feat + '_gwnom'
    expres = exprdict[feat]
    covar=expres.replace("expr.bed.gz", "covars.txt")
    # The covariates file is read with its first column as index and then
    # transposed before being handed to tensorQTL.
    covariates_df = pd.read_csv(covar, sep='\t', index_col=0).T
    print("Covariates dim:", end='')
    print(covariates_df.shape)
    phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expres)
    print("Phenotype dimensions:", end='')
    print(phenotype_df.shape)
    ## use the same chromosome set
    express_chrom = set(phenotype_pos_df.chr)
    assert len(variant_chrom.intersection(express_chrom))>0, \
        "no chromosome is shared between the genotype and expression data"
    # Drop phenotypes located on chromosomes that have no genotyped variants.
    # The mask has one entry per phenotype, so its length can never be smaller
    # than the phenotype count; test its content instead of its length.
    chrom_filter = phenotype_pos_df.chr.isin(variant_chrom)
    if not chrom_filter.all():
        phenotype_df = phenotype_df[chrom_filter]
        phenotype_pos_df = phenotype_pos_df[chrom_filter]
    ## make sure we keep only the genotypes for the expression samples
    cols=phenotype_df.columns.tolist()
    missing = [sample for sample in cols if sample not in genotype_df.columns]
    if missing:
        # Same exception type .loc would raise, but with an actionable message.
        raise KeyError(f"{len(missing)} expression sample(s) have no genotype column, e.g. {missing[:5]}")
    geno_df=genotype_df.loc[:, cols]
    ## run tensorQTL:
    cis.map_nominal(geno_df, variant_df, phenotype_df, phenotype_pos_df, prefix = tag, covariates_df= covariates_df,
                maf_threshold=0.05, window=500000, output_dir= outdir, verbose=False)
print("All done.")

 Processing feature: gene
Covariates dim:(195, 25)
Phenotype dimensions:(24703, 195)
cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 195 samples
  * 24703 phenotypes
  * 25 covariates
  * 11560560 variants
  * applying in-sample 0.05 MAF filter
  * cis-window: ±500,000
  * checking phenotypes: 24703/24703
    ** dropping 70 phenotypes without variants in cis-window
  * Computing associations
    Mapping chromosome chr1


  genotype_var_t = genotype_res_t.var(1)


    time elapsed: 0.09 min
    * writing output
    Mapping chromosome chr2
    time elapsed: 0.22 min
    * writing output
    Mapping chromosome chr3
    time elapsed: 0.33 min
    * writing output
    Mapping chromosome chr4
    time elapsed: 0.41 min
    * writing output
    Mapping chromosome chr5
    time elapsed: 0.49 min
    * writing output
    Mapping chromosome chr6
    time elapsed: 0.58 min
    * writing output
    Mapping chromosome chr7
    time elapsed: 0.68 min
    * writing output
    Mapping chromosome chr8
    time elapsed: 0.76 min
    * writing output
    Mapping chromosome chr9
    time elapsed: 0.83 min
    * writing output
    Mapping chromosome chr10
    time elapsed: 0.89 min
    * writing output
    Mapping chromosome chr11
    time elapsed: 0.98 min
    * writing output
    Mapping chromosome chr12
    time elapsed: 1.07 min
    * writing output
    Mapping chromosome chr13
    time elapsed: 1.13 min
    * writing output
    Mapping chromosome chr14
    tim

  genotype_var_t = genotype_res_t.var(1)


    time elapsed: 0.39 min
    * writing output
    Mapping chromosome chr2
    time elapsed: 0.98 min
    * writing output
    Mapping chromosome chr3
    time elapsed: 1.48 min
    * writing output
    Mapping chromosome chr4
    time elapsed: 1.88 min
    * writing output
    Mapping chromosome chr5
    time elapsed: 2.24 min
    * writing output
    Mapping chromosome chr6
    time elapsed: 2.63 min
    * writing output
    Mapping chromosome chr7
    time elapsed: 3.03 min
    * writing output
    Mapping chromosome chr8
    time elapsed: 3.38 min
    * writing output
    Mapping chromosome chr9
    time elapsed: 3.68 min
    * writing output
    Mapping chromosome chr10
    time elapsed: 3.96 min
    * writing output
    Mapping chromosome chr11
    time elapsed: 4.34 min
    * writing output
    Mapping chromosome chr12
    time elapsed: 4.78 min
    * writing output
    Mapping chromosome chr13
    time elapsed: 5.05 min
    * writing output
    Mapping chromosome chr14
    tim