# Removing correlational structure in RNAseq data

This "shuffled" dataset can be used as a negative control. These data should have no predictive ability or underlying features.

Shuffling was performed by independently permuting each sample's gene expression vector (i.e., shuffling the values across genes within every sample) and then shuffling the sample IDs.

In [1]:
import os
from random import sample
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random number generator so that the `np.random`
# calls made later in this notebook are reproducible. Note that this does
# not seed Python's built-in `random` module.
np.random.seed(123)

In [3]:
def shuffle_columns(gene):
    """Return the values of `gene` in a random order.

    Parameters
    ----------
    gene : object exposing ``.tolist()`` (e.g. a pandas Series or a NumPy
        array) holding the values to permute.

    Returns
    -------
    numpy.ndarray
        A new array with the same values as `gene`, randomly reordered
        using NumPy's global random state. The input is not modified.
    """
    values = gene.tolist()
    permuted = np.random.permutation(values)
    return permuted

In [4]:
# Location of the scaled pan-cancer RNAseq matrix; the URL embeds a fixed
# revision hash of the tybalt repository.
rna_file = (
    'https://github.com/greenelab/tybalt/raw/'
    'b68264f5ecffd8a8b2adf2bfc6ae01a340a90b4e/'
    'data/pancan_scaled_rnaseq.tsv.gz'
)

# Tab-separated file; the first column (sample IDs) becomes the index.
rnaseq_df = pd.read_csv(rna_file, sep='\t', index_col=0)

# Preview the first two samples.
rnaseq_df.head(2)

Unnamed: 0,RPS4Y1,XIST,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,...,FAM129A,C8orf48,CDK5R1,FAM81A,C13orf18,GDPD3,SMAGP,C2orf85,POU5F1B,CHST2
TCGA-02-0047-01,0.898017,-0.332348,-1.089849,-1.387105,-0.982601,-0.623583,-0.891389,-1.109676,1.100101,1.032201,...,-0.460255,0.429706,1.479864,0.848098,0.882722,-0.904272,-1.054615,2.056113,-0.967458,1.123911
TCGA-02-0055-01,-0.504287,0.900005,-0.585813,-1.387105,-0.982601,-0.565511,-0.820424,-0.91922,-0.756125,-0.608233,...,0.721747,0.039802,0.442817,0.644982,1.09682,-0.333884,0.840735,0.855049,-1.021807,0.873719


In [5]:
# Shuffle sample IDs
# Draw the permutation from NumPy's global RNG, which this notebook seeds
# with `np.random.seed`. The stdlib `random.sample` uses a separate,
# unseeded generator, so it produced a different ID order on every run.
# `np.random.permutation` shuffles a copy, leaving `rnaseq_df.index` intact;
# `.tolist()` yields a plain Python list with one entry per sample.
shuffled_sample_id = np.random.permutation(rnaseq_df.index.values).tolist()

In [6]:
# Permute the expression values of every sample (row) independently.
# The frame is assembled explicitly instead of via
# `rnaseq_df.apply(shuffle_columns, axis=1)`: since pandas 0.23, `apply`
# returns a Series of arrays (not a DataFrame) when the function returns an
# ndarray, which would make the written TSV malformed.
rnaseq_shuffled_df = pd.DataFrame(
    [shuffle_columns(sample_values) for sample_values in rnaseq_df.values],
    columns=rnaseq_df.columns,
)

# Attach the shuffled sample IDs as the row labels.
rnaseq_shuffled_df.index = shuffled_sample_id

In [7]:
# Write the shuffled matrix as a gzip-compressed, tab-separated file
# inside the `data` directory.
output_file = os.path.join('data', 'pancan_scaled_rnaseq_shuffled_ids.tsv.gz')
rnaseq_shuffled_df.to_csv(
    path_or_buf=output_file,
    sep='\t',
    compression='gzip',
)