# Description

This notebook reads every GWAS file in PhenomeXcan and counts the number of lines in each one, which is used as the number of variants included in that GWAS.

# Modules

In [1]:
from pathlib import Path

import pandas as pd

import conf

# Settings

In [2]:
# Root directory for all GWAS-parsing artifacts, located under the PhenomeXcan base directory.
GWAS_PARSING_BASE_DIR = conf.PHENOMEXCAN["BASE_DIR"].joinpath("gwas_parsing")
display(GWAS_PARSING_BASE_DIR)
# Create the directory (and any missing parents); a no-op when it already exists.
GWAS_PARSING_BASE_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/phenomexcan/gwas_parsing')

In [3]:
# Directory that receives one small text file per GWAS holding its line count.
GWAS_PARSING_N_LINES_DIR = GWAS_PARSING_BASE_DIR.joinpath("gwas_parsing_n_lines")
display(GWAS_PARSING_N_LINES_DIR)
# Create the directory (and any missing parents); a no-op when it already exists.
GWAS_PARSING_N_LINES_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/phenomexcan/gwas_parsing/gwas_parsing_n_lines')

In [4]:
# Directory with the input GWAS files (*.txt.gz); it must already exist, so it is
# checked rather than created.
GWAS_PARSING_INPUT_DIR = GWAS_PARSING_BASE_DIR.joinpath("full")
display(GWAS_PARSING_INPUT_DIR)
assert GWAS_PARSING_INPUT_DIR.exists()

PosixPath('/opt/data/data/phenomexcan/gwas_parsing/full')

# Count the number of variants in each PhenomeXcan GWAS

In [5]:
%%bash -s "$GWAS_PARSING_INPUT_DIR" "$GWAS_PARSING_N_LINES_DIR"
# $1: directory with the input *.txt.gz GWAS files.
# $2: directory that receives one line-count file per GWAS.
#
# The composed command is quoted so that GNU parallel runs the whole
# "zcat | wc -l > file" pipeline once per input file. Left unquoted, the calling
# shell would consume "|" and ">" itself and parallel would never see the inputs.
# {} is the input path; {/.} is its basename without the last extension
# (e.g. foo.txt.gz -> foo.txt).
# NOTE(review): wc -l counts every line, including a header row if the files
# have one -- confirm whether the stored count should be adjusted.
parallel -j3 "zcat {} | wc -l > '${2}'/{/.}" ::: "${1}"/*.txt.gz

In [6]:
# Collect every "*.txt" file found in the line-count directory.
files = [*GWAS_PARSING_N_LINES_DIR.glob("*.txt")]

In [7]:
# Sanity check: number of line-count files found.
len(files)

4197

In [8]:
# read number of lines per GWAS
# Map each GWAS name (the file name up to ".txt") to the integer stored on the
# first line of its line-count file.
gwas_n_vars = {}

for count_path in files:
    gwas_name = count_path.name.split(".txt")[0]
    with open(count_path) as count_file:
        first_line = count_file.readlines()[0]
    gwas_n_vars[gwas_name] = int(first_line.strip())

In [9]:
# Build the Series of counts directly, indexed by GWAS name.
# DataFrame.from_dict(...).squeeze() would collapse to a bare scalar when there is
# exactly one GWAS (1x1 frame) and stay an empty DataFrame when there are none;
# constructing the Series explicitly always yields a 1-D int64 Series.
# name=0 keeps the same Series name the squeeze-based construction produced.
df = pd.Series(gwas_n_vars, name=0, dtype="int64")

In [10]:
# Sanity check: expect a one-dimensional shape with one entry per GWAS.
df.shape

(4197,)

In [11]:
# Preview the first few GWAS names and their counts.
df.head()

M13_FOREIGNBODY    8496089
22617_7112         8496089
M13_ANKLEFOOT      8496089
22601_81212774     8496089
40001_J841         8496089
Name: 0, dtype: int64

# Save

In [12]:
# Destination path of the pickled per-GWAS counts.
output_file = GWAS_PARSING_BASE_DIR.joinpath("gwas_n_variants.pkl")
display(output_file)

PosixPath('/opt/data/data/phenomexcan/gwas_parsing/gwas_n_variants.pkl')

In [13]:
# Write the per-GWAS counts to output_file in pandas pickle format.
df.to_pickle(output_file)