In [3]:
import numpy as np
from multiprocessing import Pool

def fill_missing_with_row_means(data):
    """Replace every NaN in a 2-D array with the mean of its own row.

    The array is modified in place and the very same object is returned, so
    the caller's array changes too. Pass ``data.copy()`` to keep the original.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; row means are taken along ``axis=1``.

    Returns
    -------
    numpy.ndarray
        ``data`` itself, with each NaN replaced by the NaN-ignoring mean of
        its row. A row consisting only of NaNs stays NaN because its mean is
        undefined.
    """
    nan_mask = np.isnan(data)
    # Nothing to impute: skip the full-matrix np.nanmean pass, whose result
    # would not be used anyway.
    if not nan_mask.any():
        return data
    # Calculate means of rows ignoring NaNs
    row_means = np.nanmean(data, axis=1)
    # Replace each NaN with the mean of the row it sits in
    nan_rows, nan_cols = np.nonzero(nan_mask)
    data[nan_rows, nan_cols] = row_means[nan_rows]
    return data

def np_pearson_cor(x, y, yv=None, yvss=None):
    """Pearson correlation between every row of ``x`` and every row of ``y``.

    Derivation see: https://cancerdatascience.org/blog/posts/pearson-correlation/

    Parameters
    ----------
    x : numpy.ndarray, shape (n_x, n_features)
        Rows to correlate.
    y : numpy.ndarray, shape (n_y, n_features)
        Rows to correlate against. Only read when ``yv`` or ``yvss`` is None.
    yv : numpy.ndarray, shape (n_y, n_features), optional
        ``y`` with each row's mean subtracted. Pass it (together with
        ``yvss``) to avoid recomputing it on every call with the same ``y``.
    yvss : numpy.ndarray, shape (n_y,), optional
        Row-wise sum of squares of ``yv``.

    Returns
    -------
    numpy.ndarray, shape (n_x, n_y)
        Correlation coefficients clipped to [-1, 1]. Entries involving a
        constant row (zero sum of squares) are NaN, as the division is 0/0.
    """
    if yv is None or yvss is None:
        # Both quantities are rebuilt if either one is missing.
        yv = y - y.mean(axis=1, keepdims=True)
        yvss = np.einsum('ij,ij->i', yv, yv)  # Row-wise sum of squares without a temporary
    xv = x - x.mean(axis=1, keepdims=True)
    xvss = np.einsum('ij,ij->i', xv, xv)  # Row-wise sum of squares without a temporary

    # Cross products of the centred rows. ``yv.T`` is a view, so matmul needs
    # no more memory than the einsum form it replaces, and it can use the
    # optimised matrix-multiply routine.
    result = np.matmul(xv, yv.T)
    # Normalise in place, one axis at a time, instead of materialising the
    # (n_x, n_y) outer product of the two sum-of-squares vectors.
    result /= np.sqrt(xvss)[:, np.newaxis]
    result /= np.sqrt(yvss)[np.newaxis, :]

    # Limit the result to the range [-1, 1] (rounding can overshoot slightly)
    np.clip(result, -1.0, 1.0, out=result)
    return result

def _correlation_chunk(start, end, data, yv, yvss):
    """Worker task: correlate rows ``start:end`` of ``data`` with all rows."""
    return np_pearson_cor(data[start:end], data, yv, yvss)


def correlation_matrix_by_rows(data, chunk_size, num_processes=None, return_upper_triangle=True):
    """Row-by-row Pearson correlation matrix, computed in blocks of rows.

    Missing values are first imputed with row means via
    ``fill_missing_with_row_means``; that helper writes into ``data``, so the
    caller's array is modified when it contains NaNs.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_rows, n_features)
        One variable per row.
    chunk_size : int
        Number of rows correlated against the full matrix per step. Must be
        at least 1.
    num_processes : int, optional
        ``None``, 0 or 1 runs the chunks sequentially in this process. A
        larger value distributes the chunks over a ``multiprocessing.Pool``
        of that size. Each task is sent ``data``, ``yv`` and ``yvss`` as
        arguments, so the transfer cost grows with the matrix size.
    return_upper_triangle : bool, default True
        Every row block is correlated against all rows, so the full square
        matrix is filled regardless of this flag. When False, the strict
        lower triangle is additionally overwritten with the transposed upper
        triangle, which makes the result exactly symmetric. When True the
        matrix is returned as computed.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return all zeros.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = data.shape[0]
    correlation_matrix = np.zeros((n_rows, n_rows))

    print("Performing mean imputation for missing values ...")
    data = fill_missing_with_row_means(data)

    print("Precompute matrix quantities ...")
    # Centred rows and their sums of squares are shared by every chunk.
    yv = data - data.mean(axis=1, keepdims=True)
    yvss = np.einsum('ij,ij->i', yv, yv)  # Row-wise sum of squares without a temporary

    row_ranges = [(start, min(start + chunk_size, n_rows)) for start in range(0, n_rows, chunk_size)]

    if num_processes is None or num_processes <= 1:
        # Non-parallel execution
        for start_row, end_row in row_ranges:
            print("Working on rows", start_row, "to", end_row, "out of", n_rows, "rows")
            correlation_matrix[start_row:end_row, :] = np_pearson_cor(data[start_row:end_row], data, yv, yvss)
    else:
        # Parallel execution: submit every chunk first, then collect. The
        # blocking pool.apply would wait for each chunk before submitting the
        # next one and therefore run them one at a time.
        with Pool(processes=num_processes) as pool:
            pending = [
                (start, end, pool.apply_async(_correlation_chunk, (start, end, data, yv, yvss)))
                for start, end in row_ranges
            ]
            # Results must be fetched inside the ``with`` block: leaving it
            # terminates the pool.
            for start, end, job in pending:
                correlation_matrix[start:end, :] = job.get()

    # Mirror the upper triangle to the lower triangle
    if not return_upper_triangle:
        i_lower = np.tril_indices(n_rows, -1)
        correlation_matrix[i_lower] = correlation_matrix.T[i_lower]
    return correlation_matrix

In [7]:
import time

def test_correlation_matrix_by_rows():
    """Check ``correlation_matrix_by_rows`` against ``np.corrcoef``.

    Builds a seeded 1000 x 200 random matrix, computes the row correlation
    matrix both ways, prints the elapsed seconds of each (custom first, NumPy
    second) and raises ``AssertionError`` if the two results differ beyond
    the tolerances that ``np.allclose`` uses by default.
    """
    # Compare the customized implementation with np.corrcoef
    rng = np.random.default_rng(0)  # fixed seed so every rerun checks the same matrix
    data_matrix = rng.random((1000, 200))  # Example large data matrix
    chunk_size = 100  # Define chunk size
    num_processes = 0  # Number of parallel processes (0 selects the sequential path)

    started = time.perf_counter()
    correlation_matrix_custom = correlation_matrix_by_rows(
        data_matrix, chunk_size, num_processes, return_upper_triangle=False
    )
    time_custom = time.perf_counter() - started
    print(time_custom)

    started = time.perf_counter()
    correlation_matrix_np = np.corrcoef(data_matrix, rowvar=True)
    time_np = time.perf_counter() - started
    print(time_np)

    # Same tolerances as np.allclose, but a failure reports where and by how
    # much the matrices disagree.
    np.testing.assert_allclose(
        correlation_matrix_custom, correlation_matrix_np, rtol=1e-5, atol=1e-8
    )
# Run the comparison as soon as the cell executes; an AssertionError means the
# custom result and the np.corrcoef result differ.
test_correlation_matrix_by_rows()

Performing mean imputation for missing values ...
Precompute matrix quantities ...
Working on rows 0 to 100
Working on rows 100 to 200
Working on rows 200 to 300
Working on rows 300 to 400
Working on rows 400 to 500
Working on rows 500 to 600
Working on rows 600 to 700
Working on rows 700 to 800
Working on rows 800 to 900
Working on rows 900 to 1000
0.15220069885253906
0.01983809471130371


In [None]:
def benchmark_correlation_matrix_by_rows(n_variants=100000, n_samples=17000, chunk_size=10000, num_processes=0):
    """Time ``correlation_matrix_by_rows`` on random data and print the seconds.

    Parameters
    ----------
    n_variants : int, default 100000
        Number of rows (variants) in the random input matrix.
    n_samples : int, default 17000
        Number of columns (samples) in the random input matrix.
    chunk_size : int, default 10000
        Rows processed per step, forwarded to ``correlation_matrix_by_rows``.
    num_processes : int, default 0
        Forwarded to ``correlation_matrix_by_rows``.

    Notes
    -----
    With the defaults the float64 input alone takes about 13.6 GB and the
    100000 x 100000 float64 result about 80 GB; pass smaller sizes for a
    quick trial run.
    """
    # 17K samples, pair-wise LD of 100,000 variants
    data_matrix = np.random.rand(n_variants, n_samples)  # Example large data matrix
    started = time.perf_counter()
    # The result is deliberately not bound to a name so it can be released
    # as soon as the call returns.
    correlation_matrix_by_rows(data_matrix, chunk_size, num_processes)
    time_custom = time.perf_counter() - started
    print(time_custom)
# Start the timing run when the cell executes; the elapsed seconds are printed.
benchmark_correlation_matrix_by_rows()

Performing mean imputation for missing values ...
Precompute matrix quantities ...
Working on rows 0 to 10000


In [None]:
def benchmark_np_correlation(n_variants=100000, n_samples=17000):
    """Time ``np.corrcoef`` on a random matrix and print the elapsed seconds.

    Parameters
    ----------
    n_variants : int, default 100000
        Number of rows (variants) in the random input matrix.
    n_samples : int, default 17000
        Number of columns (samples) in the random input matrix.

    Notes
    -----
    With the defaults the float64 input takes about 13.6 GB and the
    100000 x 100000 float64 result about 80 GB; pass smaller sizes for a
    quick trial run.
    """
    # 17K samples, pair-wise LD of 100,000 variants
    data_matrix = np.random.rand(n_variants, n_samples)  # Example large data matrix
    started = time.perf_counter()
    # The result is not kept; only the elapsed time matters here.
    np.corrcoef(data_matrix, rowvar=True)
    time_np = time.perf_counter() - started
    print(time_np)
# Start the timing run when the cell executes; the elapsed seconds are printed.
benchmark_np_correlation()