# Description

**TODO: UPDATE**

It reads all gene-gene correlation matrices across chromosomes, performs some tests and saves a final, single gene-gene correlation matrix.

# Modules

In [1]:
import os
from glob import glob
from pathlib import Path

from utils import read_log_file_and_check_line_exists
import conf

# Settings

Apparently, there is no easy way to get the parent directory of
a notebook in Jupyter, so here I get that information either from
the parameter sent by `nbs/run_nbs.sh` (if called from command-line) or
from `os.getcwd()` (if called from browser).

In [2]:
# Defaults for a run where no notebook file path is provided: the current
# working directory of the kernel is taken as the notebook directory.
PHENOPLIER_NOTEBOOK_DIR = os.getcwd()
PHENOPLIER_NOTEBOOK_FILEPATH = None

In [3]:
# Parameters
# Path of this notebook; per the Settings notes this is the value sent by
# `nbs/run_nbs.sh` when the notebook is run from the command line.
PHENOPLIER_NOTEBOOK_FILEPATH = "projects/asthma-copd/nbs/15_twas/01-run_spredixcan.ipynb"

In [4]:
# A notebook file path given as a parameter takes precedence over the
# working-directory default: use the folder that contains the notebook.
if PHENOPLIER_NOTEBOOK_FILEPATH is not None:
    PHENOPLIER_NOTEBOOK_DIR = os.fspath(Path(PHENOPLIER_NOTEBOOK_FILEPATH).parent)

display(PHENOPLIER_NOTEBOOK_DIR)

'projects/asthma-copd/nbs/15_twas'

In [5]:
# Folder where the S-PrediXcan results are stored.
OUTPUT_DIR = conf.PROJECTS["ASTHMA_COPD"]["RESULTS_DIR"].joinpath("twas", "spredixcan")
# String version of the same path, passed as an argument to the bash cell.
OUTPUT_DIR_STR = str(OUTPUT_DIR)

display(OUTPUT_DIR)
display(OUTPUT_DIR_STR)

PosixPath('/opt/data/projects/asthma-copd/results/twas/spredixcan')

'/opt/data/projects/asthma-copd/results/twas/spredixcan'

# Run

In [6]:
%%bash -s "$PHENOPLIER_NOTEBOOK_DIR" "$OUTPUT_DIR_STR"
# fail fast: stop on errors, on unset variables and on failed pipelines
set -euo pipefail
# IFS=$'\n\t'

# positional arguments given to this cell by the %%bash magic:
#   $1: notebook directory (appended to PHENOPLIER_CODE_DIR)
#   $2: output directory
export PHENOPLIER_NOTEBOOK_DIR="${PHENOPLIER_CODE_DIR}/$1"
export OUTPUT_DIR="$2"

# both arguments have been consumed, remove them
shift 2

run_job () {
    # run_job is a standard function name that performs a particular job
    # depending on the context. It will be called by GNU Parallel below.
    #
    # The implementation here runs scripts/spredixcan.sh for one trait and
    # one tissue.
    #
    # Arguments:
    #   $1: string with the values pheno_id, file, sample_size, n_cases and
    #       tissue separated by commas (,)
    #
    # Exit status: non-zero if the input GWAS file cannot be uniquely
    # identified or if spredixcan.sh fails, so GNU Parallel can react to it.

    # read trait information
    # the first parameter to this function is a string with values separated by
    # commas (,). So here I split those into different variables.
    local pheno_id file sample_size n_cases tissue
    IFS=',' read -r pheno_id file sample_size n_cases tissue <<< "$1"

    # file name without its last extension
    local INPUT_FILENAME=${file%.*}

    # get input GWAS file, there should be a single file
    # here I make sure that there are no other files in the folder that
    # match this phenotype/trait filename prefix
    local GWAS_DIR="${PHENOPLIER_PROJECTS_ASTHMA_COPD_RESULTS_DIR}/final_imputed_gwas"
    local -a gwas_files=()
    local gwas_file
    for gwas_file in "${GWAS_DIR}/${INPUT_FILENAME}"*; do
        # an unmatched glob is kept as the literal pattern, so only keep
        # entries that really exist
        if [ -e "${gwas_file}" ]; then
            gwas_files+=("${gwas_file}")
        fi
    done
    local N_GWAS_FILES=${#gwas_files[@]}
    if [ "${N_GWAS_FILES}" != "1" ]; then
        echo "ERROR: found ${N_GWAS_FILES} GWAS files instead of one"
        exit 1
    fi
    local INPUT_GWAS_FILEPATH="${gwas_files[0]}"

    # OUTPUT_DIR="${PHENOPLIER_PROJECTS_ASTHMA_COPD_RESULTS_DIR}/twas/spredixcan"
    mkdir -p "${OUTPUT_DIR}"

    # make sure we are not also parallelizing within numpy, etc
    # (OpenBLAS reads OPENBLAS_NUM_THREADS, without an underscore after OPEN)
    export NUMBA_NUM_THREADS=1
    export MKL_NUM_THREADS=1
    export OPENBLAS_NUM_THREADS=1
    export NUMEXPR_NUM_THREADS=1
    export OMP_NUM_THREADS=1

    echo "Running for $pheno_id, $tissue"
    echo "Saving results in ${OUTPUT_DIR}"

    # only show warning/error lines from the script's standard output.
    # grep exits with 1 when nothing matches (the normal case), so its status
    # is neutralized and the script's own status is read from PIPESTATUS.
    bash "${PHENOPLIER_CODE_DIR}/scripts/spredixcan.sh" \
        --input-gwas-file "${INPUT_GWAS_FILEPATH}" \
        --phenotype-name "${INPUT_FILENAME}" \
        --tissue "${tissue}" \
        --output-dir "${OUTPUT_DIR}" \
    | { grep -iE "warning|error" || true; }
    local spredixcan_status=${PIPESTATUS[0]}

    if [ "${spredixcan_status}" -ne 0 ]; then
        echo "ERROR: spredixcan.sh exited with status ${spredixcan_status} for ${pheno_id}, ${tissue}"
    fi

    echo

    # propagate the script's status instead of the status of grep/echo
    return "${spredixcan_status}"
}

# export function so GNU Parallel can see it
export -f run_job

# generate a list of run_job calls for GNU Parallel
# here I read a file with information about traits (one trait per line),
# starting at its second line (presumably the first one is a header), and
# create one job for each trait/tissue combination
#
# `|| [ -n "${line}" ]` keeps the last trait when the file does not end with a
# newline: in that case read fills in the variable but returns non-zero
while IFS= read -r line || [ -n "${line}" ]; do
    # skip blank lines, which would generate jobs without trait information
    if [ -z "${line}" ]; then
        continue
    fi

    for tissue in ${PHENOPLIER_PHENOMEXCAN_PREDICTION_MODELS_MASHR_TISSUES}; do
        echo run_job "${line},${tissue}"
    done
done < <(tail -n "+2" "${PHENOPLIER_PROJECTS_ASTHMA_COPD_TRAITS_INFO_FILE}") |
    parallel -k --group --halt 2 -j"${PHENOPLIER_GENERAL_N_JOBS}"

Running for asthma_only, Thyroid
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Artery_Aorta
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Heart_Atrial_Appendage
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Liver
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Heart_Left_Ventricle
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Brain_Hippocampus
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Testis
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Uterus
Saving results in /opt/data/projects/asthma-copd/results/twas/spredixcan

Running for asthma_only, Adipose_Subcutaneous
Saving results in /opt/data/projects/asthma-copd/results/tw

# Perform some checks in output and log files

In [7]:
# the results folder must be a directory (a plain file with that name would
# also satisfy `exists()`); report the path if the check fails
assert OUTPUT_DIR.is_dir(), f"Output directory not found: {OUTPUT_DIR}"

In [8]:
# materialize the glob into a sorted list: a bare generator is exhausted after
# one pass, so re-running the cell that loops over it would check nothing
log_files = sorted(OUTPUT_DIR.glob("*.log"))

# without any log file the checks on them would pass without verifying anything
assert len(log_files) > 0, f"No log files found in {OUTPUT_DIR}"
display(len(log_files))

In [9]:
# Lines handed to the helper for every log file.
# NOTE(review): "Sucessfully" presumably mirrors the tool's log message
# verbatim -- confirm against a real log file before changing the spelling.
EXPECTED_LOG_LINES = (
    "INFO - 90 % of model's snps found",
    "INFO - Sucessfully processed metaxcan association",
)

for log_file in log_files:
    # the helper gets a new list on every call
    read_log_file_and_check_line_exists(log_file, list(EXPECTED_LOG_LINES))