# Description

This notebook applies PCA to an input data file and saves the resulting projection as a pickle file.

# Modules loading

In [None]:
# enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to project code are picked up
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

## Input data

In [None]:
# stem identifying the input data set; it is embedded in the input file name
INPUT_FILEPATH_STEM = "smultixcan-efo_partial-mashr-zscores"

In [None]:
# absolute path of the input pickle file, located in the 'z_score_std'
# subfolder of the data transformations directory
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()
display(INPUT_FILEPATH)

# input file name without its extension; reused below to name the output file
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

## Output folder

In [None]:
# folder where this notebook writes its results ('pca' subfolder of the data
# transformations directory); created, with parents, if it does not exist yet
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'pca').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

## PCA options

In [None]:
# parameters of the dimensionality reduction step; this dictionary is passed
# as-is to get_pca_proj and is also used to build the output file name
DR_OPTIONS = dict(
    n_components=50,
    svd_solver='full',
    random_state=0,
)

In [None]:
# every option/setting of this run, collected in one dictionary that is used
# to generate the output file name; a shallow copy keeps DR_OPTIONS untouched
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

# Load input file

In [None]:
# load the pickled input data
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILEPATH must
# point to a trusted file
data = pd.read_pickle(INPUT_FILEPATH)

In [None]:
# dimensions of the loaded input data
display(data.shape)

In [None]:
# preview the first rows of the input data
display(data.head())

# PCA

In [None]:
from data.dimreduction import get_pca_proj

In [None]:
# run the project's PCA helper on the input data with the options above
# NOTE(review): the result is used below like a pandas DataFrame
# (.shape, .head(), .iloc, .to_pickle) - confirm against get_pca_proj
dr_data = get_pca_proj(data, DR_OPTIONS)

In [None]:
# dimensions of the PCA result
display(dr_data.shape)

In [None]:
# preview the first rows of the PCA result
display(dr_data.head())

## Plot

In [None]:
# pairwise scatter plots of the first five columns of the PCA result
g = sns.pairplot(dr_data.iloc[:, :5])

# Save

In [None]:
# absolute output path inside RESULTS_DIR; the file name is generated from the
# run options, prefixed with 'pca-' plus the input file stem
output_file = (
    RESULTS_DIR
    / generate_result_set_name(
        ALL_OPTIONS,
        prefix=f'pca-{input_filepath_stem}-',
        suffix='.pkl',
    )
).resolve()

display(output_file)

In [None]:
# write the PCA result to the output file in pickle format
dr_data.to_pickle(output_file)