### Goal

Model the gene expression imputation using not only the individual effect (PCA over the genes) but also the gene effect (PCA over the samples in the training set).

### TODO
1. build model
2. interpret the results

### Conclusions

In [3]:
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.externals import joblib
import sklearn as sk
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LassoLarsCV, LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer

import rep.preprocessing_new as prep
import rep.datasets as d
import rep.models as m
import warnings; warnings.simplefilter('ignore')

In [2]:
# Hand-picked "top" candidates used for inspection in this notebook.

# GTEx individual identifiers.
top_individuals = [
    'GTEX-ZAB4',
    'GTEX-12WSD',
    'GTEX-13OW6',
    'GTEX-131YS',
    'GTEX-NPJ8',
    'GTEX-11GSP',
    'GTEX-YFC4',
    'GTEX-13OVJ',
    'GTEX-13OW8',
    'GTEX-12ZZX',
]

# Versioned Ensembl gene ids listed as high-variance genes.
top_highvariance_genes = [
    'ENSG00000171195.10',
    'ENSG00000124233.11',
    'ENSG00000169344.15',
    'ENSG00000172179.11',
    'ENSG00000259384.6',
    'ENSG00000204414.11',
    'ENSG00000279857.1',
    'ENSG00000135346.8',
    'ENSG00000164822.4',
    'ENSG00000164816.7',
]

# Versioned Ensembl gene ids listed as low-variance genes.
top_lowvariance_genes = [
    'ENSG00000262526.2',
    'ENSG00000131143.8',
    'ENSG00000173812.10',
    'ENSG00000104904.12',
    'ENSG00000170315.13',
    'ENSG00000105185.11',
    'ENSG00000123349.13',
    'ENSG00000156976.15',
    'ENSG00000197746.13',
    'ENSG00000172757.12',
]

# Tissue names.
top_tissues = [
    'Muscle - Skeletal',
    'Skin - Sun Exposed (Lower leg)',
    'Adipose - Subcutaneous',
    'Lung',
    'Artery - Tibial',
    'Thyroid',
    'Nerve - Tibial',
    'Esophagus - Mucosa',
    'Cells - Transformed fibroblasts',
    'Esophagus - Muscularis',
    'Heart - Left Ventricle',
    'Skin - Not Sun Exposed (Suprapubic)',
    'Artery - Aorta',
    'Adipose - Visceral (Omentum)',
]

### 1. Build a linear model on PCA components (over genes and over samples)


In [None]:
# Paths of the pre-computed input / target matrices
# (blood-only, judging by the file names).
x_inputs_h5 = "/s/project/rep/processed/gtex/input_data/X_inputs_pc_onlyblood.h5"
y_targets_h5 = "/s/project/rep/processed/gtex/input_data/Y_targets_pc_onlyblood.h5"

# Build the train / validation datasets from the two files.
train_dataset, valid_dataset = d.rep_blood_expression(
    x_inputs_h5,
    y_targets_h5,
    label=None,
)

# Pull the pieces of each dataset out into flat notebook variables.
metadata_samples_train = train_dataset.metadata
metadata_samples_valid = valid_dataset.metadata
features_train = train_dataset.features
features_valid = valid_dataset.features
x_train = train_dataset.inputs
y_train = train_dataset.targets
x_valid = valid_dataset.inputs
y_valid = valid_dataset.targets

<br/>
Compute $P$ = PCA individual effect (all individuals in train and valid in blood)<br/>
    - input $X$: dim($X$) = #indiv x #genes (only blood)<br/>
    - output $P$: dim($P$) = #indiv x #n_comp<br/>

In [None]:
# Locations of the GTEx expression file and of the train / validation individual lists.
features_file = "/s/project/rep/processed/gtex/recount/recount_gtex_logratios.h5ad"
train_individuals_file = "/s/project/rep/processed/gtex/recount/train_individuals.txt"
valid_individuals_file = "/s/project/rep/processed/gtex/recount/valid_individuals.txt"

# Each list file is read through the project's one-column reader.
train_individuals = prep.read_csv_one_column(train_individuals_file)
valid_individuals = prep.read_csv_one_column(valid_individuals_file)

gtex = prep.RepAnnData.read_h5ad(features_file)

# Keep the rows whose 'Individual' is in train or valid (gtex_filtered)
# and the rows whose 'Individual' is in train only (gtex_filtered_train).
gtex_filtered = gtex[
    gtex.samples['Individual'].isin(train_individuals + valid_individuals)
]
gtex_filtered_train = gtex[
    gtex.samples['Individual'].isin(train_individuals)
]

# The unfiltered object is not used below; drop the name.
del gtex

In [None]:
# compute PCA
def pca_individual_effect(gtex, n_comp = 10, tissue = 'Whole Blood', pca_type = PCA):
    """Fit a PCA on the standardized expression matrix of one tissue.

    Parameters
    ----------
    gtex : annotated data object
        Must support boolean-mask row selection and expose ``obs['Tissue']``
        and ``X`` (the feature matrix).
    n_comp : int
        Number of components passed to ``pca_type(n_components=...)``.
    tissue : str
        Rows with ``obs['Tissue'] == tissue`` are used for the fit.
    pca_type : class
        Decomposition class; it is instantiated with ``n_components`` and must
        provide ``fit_transform``.

    Returns
    -------
    tuple
        ``(pca, features_pca)``: the fitted decomposition object and the
        transformed feature matrix.

    Raises
    ------
    ValueError
        If the selected tissue has fewer rows than ``n_comp``. Previously this
        case fell through to the ``return`` with ``pca`` undefined.
    """
    features = gtex[gtex.obs['Tissue'] == tissue].X
    print(features.shape)

    n_rows = features.shape[0]
    if n_rows < n_comp:
        raise ValueError(
            "Cannot compute {} components: tissue {!r} has only {} rows".format(
                n_comp, tissue, n_rows
            )
        )

    # StandardScaler both centers and scales each column before the PCA.
    features_scaled = StandardScaler().fit_transform(features)
    pca = pca_type(n_components=n_comp)
    features_pca = pca.fit_transform(features_scaled)
    return pca, features_pca


# Fit the individual-effect PCA on gtex_filtered (default tissue 'Whole Blood', 10 components)
# and time the call. P is the second return value, i.e. the PCA-transformed features.
%time pca_individual_effect_model, P = pca_individual_effect(gtex_filtered)

In [None]:
# Sanity check: shape of the fitted component matrix and of the projected data P.
pca_individual_effect_model.components_.shape, P.shape

<br/>
Compute $Q$ = PCA gene effect (individuals in the training set, all tissues) <br/>
    - input  $X^T$ : dim( $X^T$ ) = #genes x #samples (training set, all tissues) <br/>
    - output  $Q$ : dim( $Q$ ) = #genes x #n_comp <br/>

In [None]:
# compute PCA
def pca_gene_effect(gtex, n_comp = 10, pca_type = PCA):
    """Fit a PCA on the standardized, transposed feature matrix.

    The matrix ``gtex.X`` is transposed first, so the rows given to the PCA
    are the columns of ``X`` (genes, per the notebook's description of Q).

    Parameters
    ----------
    gtex : annotated data object
        Must expose ``X`` with a ``transpose()`` method.
    n_comp : int
        Number of components passed to ``pca_type(n_components=...)``.
    pca_type : class
        Decomposition class; it is instantiated with ``n_components`` and must
        provide ``fit_transform``.

    Returns
    -------
    tuple
        ``(pca, features_pca)``: the fitted decomposition object and the
        transformed (transposed) feature matrix.

    Raises
    ------
    ValueError
        If the transposed matrix has fewer rows than ``n_comp``. Previously
        this case fell through to the ``return`` with ``pca`` undefined.
    """
    features = gtex.X.transpose()
    print(features.shape)

    n_rows = features.shape[0]
    if n_rows < n_comp:
        raise ValueError(
            "Cannot compute {} components: transposed matrix has only {} rows".format(
                n_comp, n_rows
            )
        )

    # StandardScaler both centers and scales each column before the PCA.
    features_scaled = StandardScaler().fit_transform(features)
    pca = pca_type(n_components=n_comp)
    features_pca = pca.fit_transform(features_scaled)
    return pca, features_pca

# Fit the gene-effect PCA on the training-individual subset (10 components by default)
# and time the call. Q is the second return value, i.e. the PCA-transformed transposed matrix.
%time pca_gene_effect_model, Q = pca_gene_effect(gtex_filtered_train)

In [None]:
# Sanity check: shape of the fitted component matrix and of the projected data Q.
pca_gene_effect_model.components_.shape, Q.shape

In [None]:
# Re-standardize the 'Whole Blood' rows of gtex_filtered with a fresh StandardScaler and
# project them with the already fitted individual-effect PCA.
# NOTE(review): this is the same subset and the same preprocessing that
# pca_individual_effect(gtex_filtered) applied internally, so features_pca should
# reproduce P -- confirm whether this cell is only a consistency check.
features_centered  = StandardScaler().fit_transform(gtex_filtered[gtex_filtered.obs['Tissue'] == 'Whole Blood'].X)
features_pca = pca_individual_effect_model.transform(features_centered)

In [None]:
# Shape of the re-projected whole-blood features.
features_pca.shape

In [None]:
# Standardize inputs and targets for a single target tissue.
tissue = 'Muscle - Skeletal'

# Row positions of the samples whose 'To_tissue' metadata equals the chosen tissue.
index_train = np.where(metadata_samples_train['To_tissue'] == tissue)[0]
index_valid = np.where(metadata_samples_valid['To_tissue'] == tissue)[0]

# One scaler per matrix; each is fit on the training rows only.
st_x, st_y = StandardScaler(), StandardScaler()

xs_train = st_x.fit_transform(x_train[index_train, :])
ys_train = st_y.fit_transform(y_train[index_train, :])

# Validation rows reuse the statistics learned on the training rows.
xs_valid = st_x.transform(x_valid[index_valid, :])
ys_valid = st_y.transform(y_valid[index_valid, :])

xs_train.shape, xs_valid.shape, ys_train.shape, ys_valid.shape

In [None]:
# `seed` must be imported explicitly: the original import was commented out,
# so `seed(1)` raised NameError on a fresh kernel.
from random import random, seed

# Seed both generators used in this cell (Python's `random` for epsilon,
# numpy for `s`) so that re-running the cell gives the same numbers.
seed(1)
np.random.seed(1)

# NOTE(review): lm_genes, mu, sigma, s and y_old are not used in this cell;
# they are kept because later cells (or the commented sketch below) may rely on them.
lm_genes = []

mu, sigma = 0, 1
s = np.random.normal(mu, sigma, 1000)

# dim reduction: project the standardized training inputs on the individual-effect PCA
xs_pca = pca_individual_effect_model.transform(xs_train)
y_old = 0

# the target is the gene stored in column 100 of the standardized targets
y = ys_train[:, 100]

# random column appended to the design matrix, one uniform [0, 1) draw per sample
epsilon = [random() for _ in range(xs_pca.shape[0])]

# fit regression for individual effect
# Design matrix: [1, PCA components, epsilon]. The column of ones already models
# the intercept, so the estimator's own intercept is disabled; otherwise the
# coefficient of the ones column is redundant and beta_zero below is always ~0.
X = np.c_[np.ones(xs_pca.shape[0]), xs_pca, epsilon]
reg_indiv = LinearRegression(fit_intercept=False).fit(X, y)
y_new = reg_indiv.predict(X)

plt.scatter(y, y_new)
plt.xlabel('observed (standardized)')
plt.ylabel('fitted')

# coef_[0] is the intercept (ones column), coef_[1:-1] are the PCA-component
# coefficients; the last coefficient belongs to the epsilon column.
beta_zero, alphas = reg_indiv.coef_[:1], reg_indiv.coef_[1:-1]

# # fit regression for gene effect
# X = Q.transpose() # 
# reg_gene = LinearRegression().fit(X, y - y_new)
# y_old = reg_gene.predict(X) + y_new
# betas = reg_gene.coef_
# print(reg_gene.coef_.shape)    