# Generate a matrix with a given covariance structure

In this notebook, we simulate a matrix with 10,000 samples and 10 features.
We artificially inject two distinct signals into the matrix.

We draw the 10,000 samples from a zero-mean multivariate normal distribution with a given covariance matrix.
This covariance matrix specifies two groups of correlated features.

| Group | Correlated Features |
| :---- | :------------------ |
| 1 | 1, 2, 3 |
| 2 | 5, 6, 7 |

The remaining features (4, 8, 9, 10) are random Gaussian noise.
The second group of features has lower correlation than the first group.

In [1]:
suppressPackageStartupMessages(library(dplyr))

“package ‘dplyr’ was built under R version 3.4.4”

In [2]:
# Fix the RNG seed so the simulated draws are reproducible across runs
set.seed(seed = 1234)

In [3]:
# Simulation dimensions
p <- 10     # number of features (columns)
n <- 10000  # number of samples (rows)

In [4]:
# Start from the identity: unit variance per feature, no covariance
cov_mat <- diag(p)

# Background structure for the off-diagonal entries. With sd = 0 every draw
# equals the mean (0), so the background is all zeros; a larger sd would add
# non-negative random covariance between otherwise unrelated features.
is_lower <- lower.tri(cov_mat)
random_off_diag_structure <- abs(rnorm(n = sum(is_lower), mean = 0, sd = 0))
cov_mat[is_lower] <- random_off_diag_structure

# Injected signals, listed as (row, column, value) in the lower triangle
injected <- rbind(
    c(2, 1, 0.95),  # group 1: features 1, 2, 3
    c(3, 2, 0.90),
    c(3, 1, 0.93),
    c(6, 5, 0.90),  # group 2: features 5, 6, 7
    c(7, 6, 0.85),
    c(7, 5, 0.88)
)
cov_mat[injected[, 1:2]] <- injected[, 3]

# Mirror the lower triangle into the upper triangle so the matrix is symmetric
is_upper <- upper.tri(cov_mat)
cov_mat[is_upper] <- t(cov_mat)[is_upper]

In [5]:
# Display the full, symmetric covariance matrix
cov_mat

0,1,2,3,4,5,6,7,8,9
1.0,0.95,0.93,0,0.0,0.0,0.0,0,0,0
0.95,1.0,0.9,0,0.0,0.0,0.0,0,0,0
0.93,0.9,1.0,0,0.0,0.0,0.0,0,0,0
0.0,0.0,0.0,1,0.0,0.0,0.0,0,0,0
0.0,0.0,0.0,0,1.0,0.9,0.88,0,0,0
0.0,0.0,0.0,0,0.9,1.0,0.85,0,0,0
0.0,0.0,0.0,0,0.88,0.85,1.0,0,0,0
0.0,0.0,0.0,0,0.0,0.0,0.0,1,0,0
0.0,0.0,0.0,0,0.0,0.0,0.0,0,1,0
0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,1


In [6]:
# Preview the covariance matrix as a tibble; "minimal" name repair applies no
# renaming to the columns
dplyr::as_tibble(cov_mat, .name_repair = "minimal")

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
1.0,0.95,0.93,0,0.0,0.0,0.0,0,0,0
0.95,1.0,0.9,0,0.0,0.0,0.0,0,0,0
0.93,0.9,1.0,0,0.0,0.0,0.0,0,0,0
0.0,0.0,0.0,1,0.0,0.0,0.0,0,0,0
0.0,0.0,0.0,0,1.0,0.9,0.88,0,0,0
0.0,0.0,0.0,0,0.9,1.0,0.85,0,0,0
0.0,0.0,0.0,0,0.88,0.85,1.0,0,0,0
0.0,0.0,0.0,0,0.0,0.0,0.0,1,0,0
0.0,0.0,0.0,0,0.0,0.0,0.0,0,1,0
0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,1


In [7]:
# Label features as feature_1, feature_2, ... (one per row/column of cov_mat).
# seq_len() is used instead of seq(1, n) so the sequence never counts down.
feature_ids <- paste0("feature_", seq_len(nrow(cov_mat)))

# Convert the matrix to a tibble and name its columns after the features
cov_mat_df <- cov_mat %>%
    dplyr::as_tibble(.name_repair = "minimal")

colnames(cov_mat_df) <- feature_ids

# Prepend a column of row labels so the matrix layout is kept in the flat file
cov_mat_df <- cov_mat_df %>%
    dplyr::mutate(feature_num = feature_ids) %>%
    dplyr::select(feature_num, dplyr::everything())

# Make sure the output directory exists before writing into it
out_file <- file.path("data", "simulated_covariance_structure.tsv")
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
cov_mat_df %>% readr::write_tsv(out_file)

# Show the labelled covariance table
cov_mat_df

feature_num,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
feature_1,1.0,0.95,0.93,0,0.0,0.0,0.0,0,0,0
feature_2,0.95,1.0,0.9,0,0.0,0.0,0.0,0,0,0
feature_3,0.93,0.9,1.0,0,0.0,0.0,0.0,0,0,0
feature_4,0.0,0.0,0.0,1,0.0,0.0,0.0,0,0,0
feature_5,0.0,0.0,0.0,0,1.0,0.9,0.88,0,0,0
feature_6,0.0,0.0,0.0,0,0.9,1.0,0.85,0,0,0
feature_7,0.0,0.0,0.0,0,0.88,0.85,1.0,0,0,0
feature_8,0.0,0.0,0.0,0,0.0,0.0,0.0,1,0,0
feature_9,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1,0
feature_10,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,1


In [8]:
# Draw n samples from a zero-mean multivariate normal with covariance cov_mat
simulated_data <- MASS::mvrnorm(n = n, mu = rep(0, p), Sigma = cov_mat)

# Name the columns feature_1, feature_2, ...; seq_len() is used instead of
# 1:ncol() so the sequence never counts down
colnames(simulated_data) <- paste0("feature_", seq_len(ncol(simulated_data)))
simulated_data <- simulated_data %>% dplyr::as_tibble(.name_repair = "minimal")

# Report the dimensions, then preview the first rows
print(dim(simulated_data))
head(simulated_data)

[1] 10000    10


feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
1.0286196,1.3146307,1.1901828,0.49060054,-1.486416,-1.5234431,-2.21882023,-0.903147902,2.4918186,-1.6878627
-0.1512017,-0.3485621,-0.3135512,0.02499143,0.56701,0.592104,0.64420043,-0.006098308,0.0532215,-0.9552011
-1.0618205,-0.883511,-1.2288525,1.29905349,0.297267,0.51318971,0.68231753,-0.904131937,0.4562491,-0.6480572
2.2379443,2.6729694,1.9504273,-0.23457321,0.3891368,-0.08028292,0.09300547,-0.060453158,1.5770552,0.2610342
-0.5824264,-0.4616994,-0.208167,-0.45257621,1.5829965,1.56321431,1.03738579,-1.094187464,0.622353,-1.219694
-0.5003933,-0.5806993,-0.3988106,-0.01112573,-0.8851752,-0.26575244,-0.26149642,0.352918538,1.1879753,-1.5501888


In [9]:
# Persist the simulated samples as a tab-separated file.
# NOTE(review): the file name says "n1000" although n is set to 10000 earlier
# in this notebook; the path is kept as-is in case other code reads it.
out_file <- file.path("data", "simulated_signal_n1000_p10.tsv")
readr::write_tsv(simulated_data, out_file)