In [5]:
import pandas as pd

# Load your long-form RNA-seq expression file
df = pd.read_csv("../../data/original/rnaseq_all_data.csv")  # 🔁 Replace with your path
print("✅ Raw shape:", df.shape)
df.head()


✅ Raw shape: (46733067, 11)


Unnamed: 0,dataset_id,id,model_id,gene_id,read_count,fpkm,tpm,data_source,dataset_name,model_name,gene_symbol
0,22,133594790,SIDM01313,SIDG03516,919.0,5.69,14.41,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,CASP10
1,22,133630300,SIDM01313,SIDG21420,90.0,0.25,0.64,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,NBPF10
2,22,133630301,SIDM01313,SIDG31799,1.0,0.15,0.39,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,RPL17P51
3,22,133630302,SIDM01313,SIDG25351,0.0,0.0,0.0,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,PPATP2
4,22,133630303,SIDM01313,SIDG19863,88.0,1.31,3.32,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,MMP28


In [6]:
# Keep only rows with valid raw htseq counts
df = df[~df["tpm"].isna()].copy()
df["tpm"] = df["tpm"].astype("float32")

print("✅ Filtered count data:", df.shape)
df.head()


✅ Filtered count data: (46733066, 11)


Unnamed: 0,dataset_id,id,model_id,gene_id,read_count,fpkm,tpm,data_source,dataset_name,model_name,gene_symbol
0,22,133594790,SIDM01313,SIDG03516,919.0,5.69,14.41,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,CASP10
1,22,133630300,SIDM01313,SIDG21420,90.0,0.25,0.64,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,NBPF10
2,22,133630301,SIDM01313,SIDG31799,1.0,0.15,0.39,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,RPL17P51
3,22,133630302,SIDM01313,SIDG25351,0.0,0.0,0.0,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,PPATP2
4,22,133630303,SIDM01313,SIDG19863,88.0,1.31,3.32,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,MMP28


In [7]:
# Pivot: rows = cell lines (model_id), columns = genes (gene_id), values = counts
cell_gene_matrix = df.pivot_table(
    index="model_id",
    columns="gene_id",         # or use "gene_symbol" or "ensembl_gene_id"
    values="tpm",
    aggfunc="sum"              # Just in case there are duplicates
)

# Fill NAs with 0 (assume no expression)
cell_gene_matrix = cell_gene_matrix.fillna(0).astype(int)

print("✅ Expression matrix shape:", cell_gene_matrix.shape)
cell_gene_matrix.head()


✅ Expression matrix shape: (1362, 37602)


gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,...,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00001,0,2,0,0,0,0,0,0,0,0,...,2,0,0,28,0,11,14,8,6,2
SIDM00002,0,6,0,0,0,0,0,0,0,1,...,2,0,31,56,0,0,45,11,5,0
SIDM00003,0,11,0,0,0,0,0,0,0,0,...,0,0,29,0,0,25,44,0,5,0
SIDM00005,3,7,0,0,5,0,0,9,0,0,...,3,0,14,0,0,0,26,28,18,1
SIDM00006,0,7,0,86,1,0,0,0,0,0,...,0,0,0,525,0,0,32,0,4,0


In [8]:
# Save to Parquet (efficient for large matrices)
cell_gene_matrix.to_parquet("../../data/original/cell_gene_matrix_tpm.parquet")

print("📁 Saved as Parquet: cell_gene_matrix.parquet")


📁 Saved as Parquet: cell_gene_matrix.parquet
