# CorALS - Quickstart

## Prepare parallelization

Before running anything, we make sure that `numpy` will not oversubscribe CPUs and slow things down.
Note that this has to be executed **before importing `numpy`**.

* For full correlation matrix calculation, setting `n_threads > 1` can be used to parallelize the calculation.
* For the top-k approaches, setting `n_threads=1` makes the most sense, since parallelization is specified separately.

In [1]:
# Limit the thread pools of numpy's backends *before* numpy is loaded.
# pandas imports numpy, so it must only be imported after the limits are set.
from corals.threads import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=1)

import pandas as pd





In [65]:
# Expression matrix; the first CSV column is used as the row index.
sample_gene_matrix = pd.read_csv('brca_signature/brca_pancan_mrna.csv', index_col=0)

# Signature label, used later when naming the output files
sig_name = "BRCA-EAS"
# Load the sample groups
# NOTE(review): the doubled ".csv.csv" extension looks accidental — confirm the file name on disk.
sample_groups = pd.read_csv('./brca_signature/spl_cor_brca.csv.csv', index_col=0)
# Load the gene groups and keep only the first 50 rows
gene_groups = pd.read_csv('./brca_signature/sig_cor_brca_top1000.csv', index_col=0)
gene_groups = gene_groups.iloc[0:50, :]

In [66]:
# Label of the sample group, used later when naming the output files
group = "BRCA1104"

# Candidate samples and genes come from the row labels of the two group tables
group_genes = gene_groups.index
group_samples = sample_groups.index

# Keep only the labels that are actually present in the expression matrix
# (genes are matched against its rows, samples against its columns)
available_genes = group_genes.intersection(sample_gene_matrix.index)
available_samples = group_samples.intersection(sample_gene_matrix.columns)

# Slice the matrix, flip it so that rows are samples and columns are genes,
# and replace missing values with 0
group_data = (
    sample_gene_matrix
    .loc[available_genes, available_samples]
    .T
    .fillna(0)
)

## FULL CORR

In [67]:
# Convert the samples-by-genes frame to a plain array for CorALS
X = group_data.to_numpy()

# Full correlation matrix between all columns (genes) of X.
# The function is imported under an alias so that the `cor_full` DataFrame
# created below does not shadow it.
from corals.correlation.full.default import cor_full as compute_cor_full
cor_values = compute_cor_full(X)

# Label both axes of the matrix with the gene names
cor_full = pd.DataFrame(cor_values, index=group_data.columns, columns=group_data.columns)

# Save the correlation matrix to a new CSV file
filename = f"brca_signature/corfull_{sig_name}_{group}.csv"
cor_full.to_csv(filename)



In [68]:
# Display the dimensions of the labelled correlation matrix
cor_full.shape

(49, 49)

## TOP-K

In [64]:
# TOP K
# Rows of X are samples, columns are features (genes)
n_samples, n_features = X.shape
print(n_samples)
print(n_features)

from corals.correlation.topk.default import cor_topk
# Returns the correlation values and, for each value, the pair of feature indices it belongs to.
# NOTE(review): k=0.1 is presumably interpreted as a fraction of all pairs — confirm against the CorALS docs.
cor_topk_values, cor_topk_coo = cor_topk(X, correlation_type="spearman", k=0.1, n_jobs=8)

from corals.correlation.utils import derive_pvalues, multiple_test_correction
# calculate p-values
# NOTE(review): `multiple_test_correction` is imported but never called here,
# so the p-values written below are uncorrected.
pvalues = derive_pvalues(cor_topk_values, n_samples)

# Assemble one row per reported pair: feature indices, gene names, correlation, p-value
cor = pd.DataFrame(cor_topk_values, columns=['cor'])
p = pd.DataFrame(pvalues, columns=['pvalue'])
coo = pd.DataFrame(cor_topk_coo).transpose()
pair = pd.DataFrame({
    'start': group_data.columns[cor_topk_coo[0]],
    'end': group_data.columns[cor_topk_coo[1]],
})
# concatenate the dataframes horizontally
df_concat = pd.concat([coo, pair, cor, p], axis=1)

# Drop self-pairs by comparing the two feature indices (columns 0 and 1 come from `coo`).
# Testing `cor != 1` instead is fragile: rounding can leave a self-correlation slightly
# below 1, and distinct genes with a perfect correlation would be discarded.
df_concat = df_concat.loc[df_concat[0] != df_concat[1]]

# Save the extracted data to a new CSV file
filename = f"brca_signature/topk_{sig_name}_{group}.csv"
df_concat.to_csv(filename)






1104
928


  ts = rf * rf * (df / (1 - rf * rf))
