# Description

It runs PLINK2 on GWAS results to check that the genomic inflation factor is within acceptable limits.

# Modules

In [1]:
import re
import subprocess
from pathlib import Path
import tempfile
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

import conf
from utils import chunker

# Paths

In [2]:
# Folder where the GWAS results files (*.glm.linear) are read from
GWAS_DIR = conf.RESULTS["GLS_NULL_SIMS"] / "gwas"
display(GWAS_DIR)

PosixPath('/opt/data/results/gls/null_sims/gwas')

In [3]:
# PLINK2 executable (taken from the project configuration) that is run on
# each GWAS results file
PLINK2 = conf.PLINK["EXECUTABLE_VERSION_2"]
display(PLINK2)

PosixPath('/opt/data/software/plink/plink2')

# GWAS results files

In [4]:
# sorted() already returns a list, so the glob results do not need to be
# materialized first
gwas_files = sorted(GWAS_DIR.glob("*.glm.linear"))

# fail early with a clear message instead of letting later cells break on an
# empty list (for instance, when indexing the first file)
assert len(gwas_files) > 0, f"No GWAS results files (*.glm.linear) found in {GWAS_DIR}"

display(len(gwas_files))
display(gwas_files[:10])

10

[PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno0.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno1.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno10.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno100.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno101.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno102.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno103.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno104.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno105.glm.linear'),
 PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno106.glm.linear')]

# Check inflation factor

In [5]:
# Regular expression that extracts the genomic inflation factor (lambda) from
# the line that PLINK2 prints when running --adjust-file. The named group
# "inf_factor" captures the number; the final period of the sentence is left
# out of the group.
_INFLATION_FACTOR_LINE = r"Genomic inflation est\. lambda \(based on median chisq\) = (?P<inf_factor>[0-9\.]+)\."
PAT = re.compile(_INFLATION_FACTOR_LINE)

In [6]:
# testing: PLINK2 output where the inflation factor has decimal places
input_text = """
PLINK v2.00a3LM 64-bit Intel (26 Apr 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to plink2.log.
Options in effect:
  --adjust-file /opt/data/data/1000g/genotypes/gwas/random.pheno0.glm.linear test=ADD

Start time: Fri Apr 29 16:12:24 2022
64185 MiB RAM detected; reserving 32092 MiB for main workspace.
Using up to 4 compute threads.
--adjust: Genomic inflation est. lambda (based on median chisq) = 1.00316.
--adjust-file values (5923554 tests) written to plink2.adjusted .
End time: Fri Apr 29 16:12:33 2022
"""

m = PAT.search(input_text)
# the trailing period of the sentence must not be part of the captured value
inf_factor = m.group("inf_factor")
assert inf_factor == "1.00316"

In [7]:
# testing: PLINK2 output where the inflation factor has no decimal places
input_text = """
PLINK v2.00a3LM 64-bit Intel (26 Apr 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to plink2.log.
Options in effect:
  --adjust-file base/data/1000g/genotypes/gwas/random.pheno1.glm.linear test=ADD

Start time: Fri Apr 29 12:19:51 2022
64185 MiB RAM detected; reserving 32092 MiB for main workspace.
Using up to 4 compute threads.
--adjust: Genomic inflation est. lambda (based on median chisq) = 1.
--adjust-file values (5923554 tests) written to plink2.adjusted .
End time: Fri Apr 29 12:19:59 2022
"""

m = PAT.search(input_text)
# here the only period is the one ending the sentence, so just "1" is captured
inf_factor = m.group("inf_factor")
display(inf_factor)
assert inf_factor == "1"

'1'

In [8]:
def _compute_inflation_factor(gwas_files_group):
    res = {}
    for gwas_file in gwas_files_group:
        output_dir = Path(tempfile.mkdtemp(prefix="plink-adjust-"))
        output_file = output_dir / "outfile"
        result = subprocess.run(
            [
                PLINK2,
                "--adjust-file",
                str(gwas_file),
                "test=ADD",
                "--threads",
                str(conf.GENERAL["N_JOBS"]),
                "--out",
                str(output_file),
            ],
            stdout=subprocess.PIPE,
        )

        assert result.returncode == 0

        result_output = result.stdout.decode("utf-8")
        inf_factor = float(PAT.search(result_output).group("inf_factor"))
        res[gwas_file.name] = inf_factor

        # delete temporary folder
        shutil.rmtree(output_dir)

    return res

In [9]:
# testing: run the function on a single GWAS results file and check that the
# returned dictionary has the file name as key and a value close to 1.0
_gwas_file = gwas_files[0]
display(_gwas_file)

_tmp = _compute_inflation_factor([_gwas_file])
assert _tmp is not None
assert _gwas_file.name in _tmp
display(_tmp)
assert 1.0 <= _tmp[_gwas_file.name] <= 1.005

PosixPath('/opt/data/results/gls/null_sims/gwas/random.pheno0.glm.linear')

{'random.pheno0.glm.linear': 1.00316}

In [10]:
# Split the GWAS results files into chunks; each chunk is later processed by
# one parallel task. The chunk size is capped at 10 and forced to be at least
# 1: with fewer files than conf.GENERAL["N_JOBS"], the division below is
# truncated to 0, which is not a usable chunk size.
gwas_files_chunks = list(
    chunker(
        gwas_files,
        max(1, int(min(10, len(gwas_files) / conf.GENERAL["N_JOBS"]))),
    )
)

In [11]:
# number of chunks the GWAS results files were split into
len(gwas_files_chunks)

4

In [12]:
# Maps each GWAS results file name to its inflation factor
all_results = {}

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # submit one task per chunk of files
    tasks = []
    for chunk in gwas_files_chunks:
        tasks.append(executor.submit(_compute_inflation_factor, chunk))

    # merge the partial results as soon as each task finishes; .result()
    # re-raises any exception that happened in the worker process
    for future in as_completed(tasks):
        all_results.update(future.result())

In [13]:
# every GWAS results file must have exactly one entry in the results
assert len(gwas_files) == len(all_results)

# Create dataframe

In [14]:
# Series with one inflation factor per GWAS results file.
# NOTE(review): the index holds the full file names (for instance,
# "random.pheno0.glm.linear"), although the axis is named "phenotype_code".
all_results_df = pd.Series(all_results, name="inflation_factor")
all_results_df = all_results_df.rename_axis("phenotype_code")

In [15]:
# number of inflation factors collected
all_results_df.shape

(10,)

In [16]:
# preview of the first entries
all_results_df.head()

phenotype_code
random.pheno0.glm.linear      1.00316
random.pheno1.glm.linear      1.00000
random.pheno10.glm.linear     1.00462
random.pheno100.glm.linear    1.00398
random.pheno101.glm.linear    1.00671
Name: inflation_factor, dtype: float64

# Checks

In [17]:
# summary statistics of the inflation factors across all GWAS results files
all_results_df.describe()

count    10.000000
mean      1.002978
std       0.003821
min       1.000000
25%       1.000000
50%       1.001580
75%       1.004460
max       1.011310
Name: inflation_factor, dtype: float64

In [18]:
# all inflation factors must lie within [1.0, 1.04]
assert 1.0 <= all_results_df.min()
assert 1.04 >= all_results_df.max()

In [19]:
# GWAS results files with the largest inflation factors (top 20)
all_results_df.sort_values(ascending=False).head(20)

phenotype_code
random.pheno106.glm.linear    1.01131
random.pheno101.glm.linear    1.00671
random.pheno10.glm.linear     1.00462
random.pheno100.glm.linear    1.00398
random.pheno0.glm.linear      1.00316
random.pheno105.glm.linear    1.00000
random.pheno104.glm.linear    1.00000
random.pheno103.glm.linear    1.00000
random.pheno102.glm.linear    1.00000
random.pheno1.glm.linear      1.00000
Name: inflation_factor, dtype: float64

# Save

In [20]:
# write the inflation factors as a tab-separated file next to the GWAS folder
output_file = conf.RESULTS["GLS_NULL_SIMS"] / "random_pheno-gwas-inflation_factors.tsv"
all_results_df.to_csv(output_file, sep="\t")