### Goal

Test new ideas of implementing metrics

### TODO
- Imagine genes as a time series - compute the fourier transform and apply pearson corr.

In [3]:
# code autoreload
%load_ext autoreload
%autoreload 2
import os
import sys

import random
import math
import numpy.random as nr
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats as scistats

import scanpy.api as sc
import anndata as ad
import xarray as xr
import dask.array as da

## init plotly
# from plotly.offline import iplot, init_notebook_mode
# init_notebook_mode(connected=True)
import plotly.io as pio
pio.renderers.default = 'iframe_connected'
import plotly.graph_objs as go

import sklearn



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
sys.path.append(os.path.expanduser("~/Projects/REP/rep"))
import rep.random as rnd

In [5]:
CACHE_DIR="/s/project/rep/cache/"
RAW_DATA_DIR="/s/project/rep/raw/"
PROCESSED_DATA_DIR="/s/project/rep/processed/"

In [None]:
l2fc = ad.read_h5ad(os.path.join(PROCESSED_DATA_DIR, "gtex/OUTRIDER/l2fc.h5ad"))
l2fc = l2fc[:, ~ np.any(np.isnan(l2fc.X), axis=0)]

# calculate PCA
sc.pp.pca(l2fc)

l2fc

In [5]:
def calc_corrs(data, obs=None, method="pearson"):
    if obs is not None:
        coords = {
            "observations": xr.DataArray(obs.index, dims=("observations", )),
            **obs.to_xarray().rename_dims({"index":"observations"}),
        }
    else:
        coords = None
    
    xrds = xr.DataArray(
        data,
        dims=("observations", "features"),
        coords=coords
    )
    
    if str(method).lower() == "pearson":
        corrs = np.corrcoef(xrds)
    elif str(method).lower() == "spearman":
        corrs = scistats.spearmanr(xrds, axis=1).correlation
    else:
        raise ValueError("unknown correlation method %s" % str(method))
    
    corrs_xr = xr.DataArray(
        corrs,
        dims=("observations_1", "observations_2"),
        coords={
            **{
                key + "_1": (("observations_1",), vals.values) for key, vals in xrds.coords.items() if vals.dims == ("observations",)
            },
            **{
                key + "_2": (("observations_2",), vals.values) for key, vals in xrds.coords.items() if vals.dims == ("observations",)
            },
        }
    )
    return corrs_xr

In [7]:
corrs = calc_corrs(l2fc.X, l2fc.obs, method="spearman")
corrs

<xarray.DataArray (observations_1: 8166, observations_2: 8166)>
array([[ 1.000000e+00, -8.233220e-03,  5.517639e-02, ...,  3.126127e-02,
         1.187960e-02,  2.101145e-04],
       [-8.233220e-03,  1.000000e+00, -3.893159e-02, ..., -2.439356e-03,
        -1.951421e-02,  2.858393e-02],
       [ 5.517639e-02, -3.893159e-02,  1.000000e+00, ...,  1.858253e-02,
        -1.573536e-02, -1.305164e-02],
       ...,
       [ 3.126127e-02, -2.439356e-03,  1.858253e-02, ...,  1.000000e+00,
        -7.934143e-02,  5.313669e-03],
       [ 1.187960e-02, -1.951421e-02, -1.573536e-02, ..., -7.934143e-02,
         1.000000e+00,  2.615297e-02],
       [ 2.101145e-04,  2.858393e-02, -1.305164e-02, ...,  5.313669e-03,
         2.615297e-02,  1.000000e+00]])
Coordinates:
  * observations_1     (observations_1) object 'GTEX-111CU-1826-SM-5GZYN' ... 'GTEX-ZXG5-0005-SM-57WCN'
    individual_1       (observations_1) object 'GTEX-111CU' ... 'GTEX-ZXG5'
    SMATSSCR_1         (observations_1) float64 0.0 2.0 1.

In [8]:
corrs_pca = calc_corrs(l2fc.obsm["X_pca"], l2fc.obs)
corrs_pca

<xarray.DataArray (observations_1: 8166, observations_2: 8166)>
array([[ 1.000000e+00,  1.897561e-01, -2.154363e-01, ...,  4.721927e-02,
         1.577282e-01,  6.047217e-02],
       [ 1.897561e-01,  1.000000e+00, -1.134831e-02, ...,  1.382194e-01,
        -1.712073e-01, -1.682804e-01],
       [-2.154363e-01, -1.134831e-02,  1.000000e+00, ...,  2.255154e-01,
        -4.201826e-01, -2.112321e-02],
       ...,
       [ 4.721927e-02,  1.382194e-01,  2.255154e-01, ...,  1.000000e+00,
        -6.722392e-04,  2.185299e-01],
       [ 1.577282e-01, -1.712073e-01, -4.201826e-01, ..., -6.722392e-04,
         1.000000e+00,  4.920097e-01],
       [ 6.047217e-02, -1.682804e-01, -2.112321e-02, ...,  2.185299e-01,
         4.920097e-01,  1.000000e+00]])
Coordinates:
  * observations_1     (observations_1) object 'GTEX-111CU-1826-SM-5GZYN' ... 'GTEX-ZXG5-0005-SM-57WCN'
    individual_1       (observations_1) object 'GTEX-111CU' ... 'GTEX-ZXG5'
    SMATSSCR_1         (observations_1) float64 0.0 2.0 1.

In [9]:
np.unique(corrs.subtissue_1)

array(['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum',
       'Adrenal_Gland', 'Artery_Aorta', 'Artery_Coronary',
       'Artery_Tibial', 'Brain_Anterior_cingulate_cortex_BA24',
       'Brain_Caudate_basal_ganglia', 'Brain_Cerebellar_Hemisphere',
       'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
       'Brain_Hippocampus', 'Brain_Hypothalamus',
       'Brain_Nucleus_accumbens_basal_ganglia',
       'Brain_Putamen_basal_ganglia', 'Breast_Mammary_Tissue',
       'Cells_EBV_transformed_lymphocytes',
       'Cells_Transformed_fibroblasts', 'Colon_Sigmoid',
       'Colon_Transverse', 'Esophagus_Gastroesophageal_Junction',
       'Esophagus_Mucosa', 'Esophagus_Muscularis',
       'Heart_Atrial_Appendage', 'Heart_Left_Ventricle', 'Liver', 'Lung',
       'Muscle_Skeletal', 'Nerve_Tibial', 'Ovary', 'Pancreas',
       'Pituitary', 'Prostate', 'Skin_Not_Sun_Exposed_Suprapubic',
       'Skin_Sun_Exposed_Lower_leg', 'Small_Intestine_Terminal_Ileum',
       'Spleen', 'Stomac

In [10]:
np.unique(corrs.tissue_1)

array(['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel',
       'Brain', 'Breast', 'Colon', 'Esophagus', 'Heart', 'Liver', 'Lung',
       'Muscle', 'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate',
       'Skin', 'Small Intestine', 'Spleen', 'Stomach', 'Testis',
       'Thyroid', 'Uterus', 'Vagina', 'nan'], dtype=object)

In [11]:
sel = corrs_pca.sel(observations_1=(corrs_pca.tissue_1 == "Blood")) #, observations_2=(corrs_pca.tissue_2 == "Brain")))
sel = sel.sel(observations_2=np.isin(sel.individual_2, sel.individual_1))
sel = sel.sel(observations_1=np.isin(sel.individual_1, sel.individual_2))
#sel = np.abs(corrs_pca)
sel = sel.sortby(["individual_1", "individual_2"])
#sel = pd.DataFrame(sel.values, columns=sel.observations_2.values, index=sel.observations_1.values)

trace = go.Heatmap(
    z=sel.values,
    x=sel.observations_2,
    y=sel.observations_1,
    colorscale='RdBu',
    zmin=-1,
    zmax=1
)
layout = go.Layout(
    xaxis=dict(
        automargin=True,
    ),
    yaxis=dict(
        automargin=True,
    ),
    width=1500,
    height=1500
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)

#fig, ax = plt.subplots(figsize=(25,25))
#sns.heatmap(sel)

In [12]:
sel = corrs.sel(observations_1=(corrs.tissue_1 == "Blood")) #, observations_2=(corrs_pca.tissue_2 == "Brain"))
sel = sel.sel(observations_2=np.isin(sel.individual_2, sel.individual_1))
sel = sel.sel(observations_1=np.isin(sel.individual_1, sel.individual_2))
sel = sel.sortby(["individual_1", "individual_2"])
#sel = pd.DataFrame(sel.values, columns=sel.observations_2.values, index=sel.observations_1.values)

trace = go.Heatmap(
    z=sel.values,
    x=sel.observations_2,
    y=sel.observations_1,
    colorscale='RdBu',
    zmin=-1,
    zmax=1
)
layout = go.Layout(
    xaxis=dict(
        automargin=True,
    ),
    yaxis=dict(
        automargin=True,
    ),
    width=1500,
    height=1500
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)

#fig, ax = plt.subplots(figsize=(25,25))
#sns.heatmap(sel)

In [13]:
sel = corrs.sel(observations_1=(corrs.tissue_1 == "Blood"), observations_2=(corrs.tissue_2 == "Lung"))
sel = sel.sel(observations_2=np.isin(sel.individual_2, sel.individual_1))
sel = sel.sel(observations_1=np.isin(sel.individual_1, sel.individual_2))
sel = sel.sortby(["individual_1", "individual_2"])
#sel = pd.DataFrame(sel.values, columns=sel.observations_2.values, index=sel.observations_1.values)

trace = go.Heatmap(
    z=sel.values,
    x=sel.observations_2,
    y=sel.observations_1,
    colorscale='RdBu',
    zmin=-1,
    zmax=1
)
layout = go.Layout(
    xaxis=dict(
        automargin=True,
    ),
    yaxis=dict(
        automargin=True,
    ),
    width=1500,
    height=1500,
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)

#fig, ax = plt.subplots(figsize=(25,25))
#sns.heatmap(sel)

In [14]:
x = l2fc.obs.query("(individual == 'GTEX-UPIC') & (tissue == 'Blood')")

sns.scatterplot(x=x.X[0], y=x.X[1])

AttributeError: 'DataFrame' object has no attribute 'X'

In [None]:
amax_sel = np.abs(sel).argmax(dim="observations_2")
np.abs(sel).max(dim="observations_2")

In [None]:
best_corr = pd.DataFrame({
    "obs_1" : sel.observations_1.values, 
    "obs_2": sel.observations_2[amax_sel].values, 
    "corr": sel.isel(observations_2 = amax_sel)
})
best_corr.head()

In [None]:
best_corr_df = sel.isel(observations_2 = amax_sel).to_dataframe("corr")
best_corr_df.head()

In [None]:
np.size(np.where(best_corr_df.individual_1 == best_corr_df.individual_2)) / best_corr_df.shape[0]

In [None]:
best_corr_df[best_corr_df.individual_1 == best_corr_df.individual_2][["observations_2", "corr"]].head()

In [None]:
x = xrds.sel(observations="GTEX-111YS-0006-SM-5NQBE")
y = xrds.sel(observations="GTEX-111YS-0626-SM-5GZXV")


In [None]:
sc.pl.scatter(raw_data, x="GTEX-111YS-0006-SM-5NQBE", y="GTEX-111YS-0626-SM-5GZXV")

In [None]:
sel

In [19]:
tissue = blood
other_tissue = lung

tissue_df = pd.DataFrame(tissue.values, columns=tissue.features, index=tissue.observations)
other_tissue_df = pd.DataFrame(other_tissue.values, columns=other_tissue.features, index=other_tissue.observations)


In [23]:
tissue_df.T.corr()

observations,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,GTEX-11DXX-0005-SM-5NQ8B,GTEX-11DXY-0006-SM-5NQ8N,...,GTEX-ZUA1-0005-SM-4YCEV,GTEX-ZV68-0006-SM-4YCEJ,GTEX-ZV7C-0005-SM-57WDL,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXG5-0005-SM-57WCN
observations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-111YS-0006-SM-5NQBE,1.000000,0.997549,0.996510,-0.996208,-0.250057,-0.788917,-0.979496,0.995768,0.998193,0.507571,...,0.877945,0.996486,0.997558,-0.946394,0.994376,-0.911995,0.993767,0.998994,-0.174626,0.996966
GTEX-1122O-0005-SM-5O99J,0.997549,1.000000,0.998850,-0.991120,-0.198918,-0.756657,-0.989199,0.998670,0.993694,0.556735,...,0.855720,0.995318,0.999836,-0.962796,0.996341,-0.888995,0.997304,0.998935,-0.119488,0.999205
GTEX-113IC-0006-SM-5NQ9C,0.996510,0.998850,1.000000,-0.991401,-0.201192,-0.758150,-0.988201,0.999342,0.992089,0.558988,...,0.862021,0.996720,0.998646,-0.960648,0.994275,-0.886801,0.996352,0.997232,-0.116170,0.999519
GTEX-113JC-0006-SM-5O997,-0.996208,-0.991120,-0.991401,1.000000,0.319444,0.832422,0.963064,-0.990082,-0.998175,-0.449934,...,-0.912755,-0.997614,-0.990674,0.921653,-0.988131,0.936902,-0.986593,-0.993075,0.235267,-0.992007
GTEX-117YW-0005-SM-5NQ8Z,-0.250057,-0.198918,-0.201192,0.319444,1.000000,0.789098,0.056925,-0.190686,-0.297514,0.674204,...,-0.648148,-0.274305,-0.194721,-0.054685,-0.221481,0.617737,-0.177627,-0.216567,0.951077,-0.204639
GTEX-1192W-0005-SM-5NQBQ,-0.788917,-0.756657,-0.758150,0.832422,0.789098,1.000000,0.656029,-0.751576,-0.818565,0.098455,...,-0.968453,-0.805465,-0.753738,0.565870,-0.768706,0.965259,-0.741901,-0.767830,0.697853,-0.760673
GTEX-1192X-0005-SM-5NQC3,-0.979496,-0.989199,-0.988201,0.963064,0.056925,0.656029,1.000000,-0.989704,-0.968838,-0.667660,...,-0.776146,-0.974053,-0.989682,0.990738,-0.982363,0.814773,-0.990582,-0.985982,-0.022290,-0.988002
GTEX-11DXW-0006-SM-5NQ7Y,0.995768,0.998670,0.999342,-0.990082,-0.190686,-0.751576,-0.989704,1.000000,0.991047,0.567665,...,0.857503,0.995867,0.998572,-0.963422,0.992964,-0.882390,0.998006,0.997185,-0.103032,0.999612
GTEX-11DXX-0005-SM-5NQ8B,0.998193,0.993694,0.992089,-0.998175,-0.297514,-0.818565,-0.968838,0.991047,1.000000,0.466096,...,0.896313,0.995966,0.993495,-0.931682,0.992525,-0.931145,0.989144,0.996232,-0.218683,0.993050
GTEX-11DXY-0006-SM-5NQ8N,0.507571,0.556735,0.558988,-0.449934,0.674204,0.098455,-0.667660,0.567665,0.466096,1.000000,...,0.101339,0.498071,0.558693,-0.747927,0.540145,-0.125962,0.575404,0.536226,0.739635,0.555544


In [103]:
corrs.individual_1 == corrs.individual_2

<xarray.DataArray (observations_1: 6766, observations_2: 6766)>
array([[ True, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       ...,
       [False, False, False, ...,  True, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False,  True]])
Coordinates:
  * observations_1      (observations_1) object 'GTEX-111CU-1826-SM-5GZYN' ... 'GTEX-ZXG5-0005-SM-57WCN'
    RNA.Seq_1           (observations_1) object 'SRR1310275' ... 'SRR1382485'
    SRA_Sample_1        (observations_1) object 'SRS623944' ... 'SRS635146'
    sex_1               (observations_1) object 'male' 'male' ... 'male'
    BioSample_1         (observations_1) object 'SAMN02789465' ... 'SAMN02791282'
    Experiment_1        (observations_1) object 'SRX558418' ... 'SRX591953'
    tissue_1            (observations_1) object 'Adipose Tissue' ... 'Blood'
    sub