In [1]:
# Make sure rpy2 is installed
# !pip install rpy2

import pandas as pd
import pyarrow.parquet as pq
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
pandas2ri.activate()


In [2]:
# Load the parquet matrix (cell lines × genes, with integer counts)
df = pd.read_parquet("../../data/original/cell_gene_matrix.parquet")

print("✅ Loaded count matrix:", df.shape)
df.head()


✅ Loaded count matrix: (1362, 37602)


gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,...,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00001,0,121,3,38,0,1,0,0,0,0,...,239,0,0,1735,0,3306,2273,688,210,132
SIDM00002,152,799,0,230,75,193,1,2,1,51,...,458,0,12279,9299,0,0,19371,2554,497,97
SIDM00003,124,2038,0,0,117,2,0,0,8,20,...,0,17,15504,0,0,25356,25481,0,734,0
SIDM00005,149,244,1,64,139,17,3,40,3,6,...,203,0,1389,0,0,0,2743,1502,422,51
SIDM00006,121,1167,11,30467,128,4,1,0,0,0,...,0,0,0,101402,0,0,16203,0,521,80


In [3]:
# R expects genes as rows, samples as columns
df_t = df.transpose()

print("🔁 Transposed shape:", df_t.shape)
df_t.head()


🔁 Transposed shape: (37602, 1362)


model_id,SIDM00001,SIDM00002,SIDM00003,SIDM00005,SIDM00006,SIDM00007,SIDM00008,SIDM00009,SIDM00011,SIDM00014,...,SIDM01956,SIDM01957,SIDM01958,SIDM01960,SIDM01961,SIDM01962,SIDM01963,SIDM01964,SIDM01965,SIDM01966
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDG00001,0,152,124,149,121,357,0,127,0,22,...,2,0,1,2,0,0,2,2,5,1
SIDG00002,121,799,2038,244,1167,0,2,1886,372,2,...,0,0,4,8,0,5,4,2,24,0
SIDG00003,3,0,0,1,11,0,8,3,4,0,...,79,6,3,342,77,375,333,211,1,346
SIDG00004,38,230,0,64,30467,27,5,17,32961,42,...,0,1,0,9,2,0,2,0,143,1335
SIDG00005,0,75,117,139,128,231,2,8,0,42,...,5,23,17,0,4,5,13,0,30,11


In [4]:
from rpy2.robjects import globalenv

# Import necessary R packages
limma = importr("limma")
edgeR = importr("edgeR")
base = importr("base")

# 1. Convert and assign to R global environment
with pandas2ri.converter.context():
    r_counts = pandas2ri.py2rpy(df_t)

globalenv["r_counts"] = r_counts  # 👈 Make it available inside R

# 2. Run voom transformation
r('dge <- DGEList(counts = r_counts)')
r('dge <- calcNormFactors(dge)')
r('voom_result <- voom(dge, plot=FALSE)')



ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

<rpy2.robjects.vectors.ListVector object at 0x7fbfb2d40f40> [RTYPES.VECSXP]
R classes: ('EList',)
[ListSexpVe..., FloatSexpV..., FloatSexpV..., FloatSexpV...]
  targets: <class 'rpy2.rinterface.ListSexpVector'>
  <rpy2.rinterface.ListSexpVector object at 0x7fbfb4548480> [RTYPES.VECSXP]
  E: <class 'rpy2.rinterface.FloatSexpVector'>
  <rpy2.rinterface.FloatSexpVector object at 0x7fbfb44b29c0> [RTYPES.REALSXP]
  weights: <class 'rpy2.rinterface.FloatSexpVector'>
  <rpy2.rinterface.FloatSexpVector object at 0x7fbfb45480c0> [RTYPES.REALSXP]
  design: <class 'rpy2.rinterface.FloatSexpVector'>
  <rpy2.rinterface.FloatSexpVector object at 0x7fbfb80ae100> [RTYPES.REALSXP]

In [5]:
# Extract the log2 CPM matrix from voom result
voom_matrix = r('voom_result$E')

# Already a NumPy array — no need for rpy2py conversion
voom_matrix = r('voom_result$E')

# Convert to DataFrame
voom_df = pd.DataFrame(
    voom_matrix,
    index=df_t.index,   # genes
    columns=df_t.columns  # samples (cell lines)
).transpose() 

# Remove the second row
voom_df_clean = voom_df.drop(voom_df.index[1])

# Rename columns label
voom_df.index.name = "SANGER_MODEL_ID"

print("✅ Voom-transformed matrix shape:", voom_df.shape)
voom_df.head()


✅ Voom-transformed matrix shape: (1362, 37602)


gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,...,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
SANGER_MODEL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00001,-6.793635,1.131178,-3.98628,-0.526848,-6.793635,-5.208672,-6.793635,-6.793635,-6.793635,-6.793635,...,2.110247,-6.793635,-6.793635,4.967501,-6.793635,5.897454,5.357065,3.633678,1.924042,1.256214
SIDM00002,-0.093907,2.296382,-8.346572,0.50205,-1.108168,0.249617,-6.76161,-6.024644,-6.76161,-1.660072,...,1.494205,-8.346572,6.237392,5.836365,-8.346572,-8.346572,6.895076,3.972253,1.61198,-0.739242
SIDM00003,-0.600179,3.433112,-8.560181,-8.560181,-0.683664,-6.238253,-8.560181,-8.560181,-4.472718,-3.202629,...,-8.560181,-3.430898,6.360219,-8.560181,-8.560181,7.069887,7.076982,-8.560181,1.960438,-8.560181
SIDM00005,1.455215,2.164904,-5.183824,0.242441,1.355335,-1.639503,-3.961432,-0.428936,-3.961432,-3.068347,...,1.900099,-6.768786,4.671564,-6.768786,-6.768786,-6.768786,5.653015,4.784363,2.954021,-0.082286
SIDM00006,-0.629975,2.634419,-4.031226,7.340196,-0.549163,-5.384863,-6.969825,-8.554788,-8.554788,-8.554788,...,-8.554788,-8.554788,-8.554788,9.074946,-8.554788,-8.554788,6.42923,-8.554788,1.471736,-1.223871


In [6]:
voom_df.to_parquet("../../data/processed/voom_transformed_data.parquet")
print("📁 Saved: voom_logCPM.parquet")
 

📁 Saved: voom_logCPM.parquet
