# Test DESeq2 normalization function

In [2]:
import os
# Set the working directory to the project root. The cells below read
# `data/...` with relative paths, and the `src` import that follows presumably
# relies on the current directory as well, so this must run first.
# The location is machine-specific: override it with the
# DESEQ2_COUNTS_SCALING_ROOT environment variable instead of editing the
# notebook. The original path stays as the default, so existing setups behave
# exactly as before.
PROJECT_ROOT = os.environ.get(
    "DESEQ2_COUNTS_SCALING_ROOT",
    "/Volumes/TucciSSD/Bioinformatics/workspace/other/DESeq2_counts_scaling",
)
os.chdir(PROJECT_ROOT)

import pandas as pd
from src.deseq2_norm import DESeq2


## Load Data

In [4]:
# Read the raw count matrix (first CSV column becomes the index) and flip it in
# the same expression, so that every row is a sample and every column a gene --
# the orientation scikit-learn expects.
counts = pd.read_csv("data/simulated_counts.csv", index_col=0).T
# Preview the first few samples
counts.head()

Unnamed: 0,Gene1,Gene2,Gene3,Gene4,Gene5,Gene6,Gene7,Gene8,Gene9,Gene10,...,Gene1991,Gene1992,Gene1993,Gene1994,Gene1995,Gene1996,Gene1997,Gene1998,Gene1999,Gene2000
Sample1,9706,7126,10350,5175,12208,4150,3281,7054,7117,7235,...,1761,1210,15021,479,4678,1810,7551,2806,4466,1682
Sample2,39508,28960,42201,21008,49160,17230,13320,28572,28582,29179,...,7158,5060,60305,1904,18640,7720,31819,11500,18136,7052
Sample3,21334,15245,22384,11284,26287,9297,7239,15080,15266,15668,...,3897,2722,32658,1092,10160,4150,16757,6165,9826,3817
Sample4,32436,23769,34398,17331,40203,14304,10859,23336,23306,24309,...,5872,4089,49910,1578,15573,6447,25919,9489,15117,5805
Sample5,18997,13838,20100,10199,23418,8442,6369,13689,13753,14431,...,3525,2472,29217,969,8928,3691,15211,5482,8977,3473


In [5]:
# Positional hold-out split of the samples (rows):
#   rows 0-9   -> used to fit the normaliser
#   rows 10-19 -> kept aside as unseen test samples
X_train = counts.iloc[:10]
X_test = counts.iloc[10:20]

In [None]:
# Instantiate the DESeq2 normaliser and ask it to keep pandas output from transform
deseq = DESeq2().set_output(transform="pandas")
# Fit only on the training samples and transform them
X_train_norm = deseq.fit_transform(X_train)
# Show the training size factors (last expression -> cell output) so they can be
# compared with the values produced by the original DESeq2 implementation
deseq.get_size_factors(X_train)

array([0.51776128, 2.10103797, 1.12776855, 1.72755235, 1.01045239,
       1.26421802, 0.61856166, 0.58020872, 1.73392221, 0.59357909])

In [45]:
# Reference size factors for the training samples, precomputed and stored on disk
expected_size_factors_train = pd.read_csv(
    "data/size_factors_train.csv", index_col=0
)["SizeFactor"]
# Size factors produced by the fitted normaliser, wrapped in a Series so that
# the pandas testing helper can compare them
computed_size_factors_train = pd.Series(deseq.get_size_factors(X_train))

# Compare values only: Series names and index labels are deliberately ignored
# because the computed Series carries a default positional index
pd.testing.assert_series_equal(
    left=expected_size_factors_train,
    right=computed_size_factors_train,
    check_names=False,
    check_index=False,
)

In [None]:
# Reference normalised counts for the training samples
expected_norm_counts_train = pd.read_csv("data/norm_counts_train.csv", index_col=0)

# The reference is transposed, like the raw counts were, before it is compared
# with the normalised training frame; raises AssertionError on any mismatch
pd.testing.assert_frame_equal(
    left=X_train_norm,
    right=expected_norm_counts_train.T,
)

In [50]:
# Transform (without re-fitting) the held-out test samples
X_test_norm = deseq.transform(X_test)
# Show the test size factors (last expression -> cell output) so they can be
# compared with the values produced by the original DESeq2 implementation
deseq.get_size_factors(X_test)

array([1.06083179, 0.22047784, 1.84089125, 1.83176398, 2.07821925,
       2.06181803, 1.38542057, 0.72570834, 1.46703069, 1.24041424])

In [51]:
# Reference size factors for the held-out samples, precomputed and stored on disk
expected_size_factors_test = pd.read_csv(
    "data/size_factors_test.csv", index_col=0
)["SizeFactor"]
# Size factors the already-fitted normaliser reports for the test samples,
# wrapped in a Series so that the pandas testing helper can compare them
computed_size_factors_test = pd.Series(deseq.get_size_factors(X_test))

# Compare values only: Series names and index labels are deliberately ignored
# because the computed Series carries a default positional index
pd.testing.assert_series_equal(
    left=expected_size_factors_test,
    right=computed_size_factors_test,
    check_names=False,
    check_index=False,
)

In [52]:
# Reference normalised counts for the held-out samples
expected_norm_counts_test = pd.read_csv("data/norm_counts_test.csv", index_col=0)

# The reference is transposed, like the raw counts were, before it is compared
# with the normalised test frame; raises AssertionError on any mismatch
pd.testing.assert_frame_equal(
    left=X_test_norm,
    right=expected_norm_counts_test.T,
)