In [7]:
# export

import numpy as np
from multiprocessing import Pool
import time

def np_pearson_cor(x, y, yv=None, yvss=None):
    """Pearson correlation of every row of ``x`` against every row of ``y``.

    Both inputs are treated as 2-D arrays holding one variable per row
    (means and sums of squares are taken along ``axis=1``).

    Args:
        x: Array whose rows are correlated against the rows of ``y``.
        y: Array supplying the second set of rows. Only read when ``yv`` or
            ``yvss`` is missing.
        yv: Optional pre-centred copy of ``y`` (each row minus its mean).
        yvss: Optional per-row sum of squares of ``yv``.

    Returns:
        Array with one row per row of ``x`` and one column per row of ``y``,
        clamped to the closed interval [-1, 1].

    Note:
        A constant row makes the denominator zero, so its correlations
        come out as NaN.
    """
    # Derivation see: https://cancerdatascience.org/blog/posts/pearson-correlation/
    cached_stats_missing = yv is None or yvss is None
    if cached_stats_missing:
        # Both statistics are rebuilt together, even if only one was omitted.
        yv = y - y.mean(axis=1, keepdims=True)
        yvss = (yv * yv).sum(axis=1)

    x_centred = x - x.mean(axis=1, keepdims=True)
    x_sum_sq = (x_centred * x_centred).sum(axis=1)

    cross_products = x_centred @ yv.T
    normaliser = np.sqrt(np.outer(x_sum_sq, yvss))

    # Rounding can push values marginally past +/-1; clamp them back in range.
    return np.clip(cross_products / normaliser, -1.0, 1.0)

def correlation_chunk(start_row, end_row, data, yv, yvss):
    """Correlate rows ``start_row:end_row`` of ``data`` against all rows of ``data``.

    Args:
        start_row: Index of the first row in the chunk (inclusive).
        end_row: Index one past the last row in the chunk (exclusive).
        data: Full data array; the chunk is sliced from it and it is also
            forwarded as the ``y`` argument of ``np_pearson_cor``.
        yv: Pre-centred copy of ``data``, forwarded to ``np_pearson_cor``.
        yvss: Per-row sum of squares of ``yv``, forwarded to ``np_pearson_cor``.

    Returns:
        Whatever ``np_pearson_cor`` returns for the selected rows.
    """
    return np_pearson_cor(data[start_row:end_row], data, yv, yvss)

def correlation_matrix_by_rows(data, chunk_size, num_processes=None):
    """Build the row-by-row Pearson correlation matrix of ``data`` in chunks.

    The rows are split into consecutive blocks of at most ``chunk_size``
    rows. Each block is correlated against every row of ``data`` and the
    result is written into the matching rows of the output matrix.

    Args:
        data: 2-D array-like with one variable per row.
        chunk_size: Positive number of rows handled per block.
        num_processes: Number of worker processes. ``None`` or any value
            ``<= 1`` runs the blocks serially in the current process.

    Returns:
        A float64 array of shape ``(n_rows, n_rows)`` whose lower triangle
        is an exact mirror of its upper triangle.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step makes range() empty, which would otherwise return
        # an all-zero matrix without any error.
        raise ValueError(
            "chunk_size must be a positive integer, got {!r}".format(chunk_size)
        )

    # Accept array-likes (e.g. nested lists) as well as ndarrays.
    data = np.asarray(data)
    n_rows = data.shape[0]
    correlation_matrix = np.zeros((n_rows, n_rows))

    # Centre the rows once; every chunk reuses these statistics.
    yv = data - data.mean(axis=1, keepdims=True)
    yvss = (yv * yv).sum(axis=1)

    row_ranges = [
        (start, min(start + chunk_size, n_rows))
        for start in range(0, n_rows, chunk_size)
    ]

    if num_processes is None or num_processes <= 1:
        # Non-parallel execution
        for start, end in row_ranges:
            correlation_matrix[start:end, :] = np_pearson_cor(
                data[start:end], data, yv, yvss
            )
    else:
        # Parallel execution: one task per row range.
        tasks = [(start, end, data, yv, yvss) for start, end in row_ranges]
        with Pool(processes=num_processes) as pool:
            results = pool.starmap(correlation_chunk, tasks)

        # Assemble the correlation matrix from the results
        for (start, end), chunk_corr in zip(row_ranges, results):
            correlation_matrix[start:end, :] = chunk_corr

    # Mirror the upper triangle to the lower triangle so that the returned
    # matrix is exactly symmetric.
    i_lower = np.tril_indices(n_rows, -1)
    correlation_matrix[i_lower] = correlation_matrix.T[i_lower]

    return correlation_matrix

In [9]:
def test_correlation_matrix_by_rows():
    """Check ``correlation_matrix_by_rows`` against ``np.corrcoef``.

    Both the parallel and the serial code paths are compared with the
    reference result on the same seeded random input.
    """
    rng = np.random.default_rng(0)  # Fixed seed so any failure is reproducible
    data_matrix = rng.random((1000, 100))  # Example large data matrix
    chunk_size = 100  # Define chunk size
    num_processes = 4  # Number of parallel processes

    correlation_matrix_np = np.corrcoef(data_matrix, rowvar=True)

    # Tolerances match the defaults of np.allclose.
    correlation_matrix_parallel = correlation_matrix_by_rows(
        data_matrix, chunk_size, num_processes
    )
    np.testing.assert_allclose(
        correlation_matrix_parallel, correlation_matrix_np, rtol=1e-5, atol=1e-8
    )

    correlation_matrix_serial = correlation_matrix_by_rows(data_matrix, chunk_size)
    np.testing.assert_allclose(
        correlation_matrix_serial, correlation_matrix_np, rtol=1e-5, atol=1e-8
    )
test_correlation_matrix_by_rows()