# Assessing and addressing transcriptomics batch effect in CCF data

With the purpose of segmenting multi-omics data from the Crohn's & Colitis Foundation (CCF), we apply Non-negative Matrix Factorization (NMF) to the transcriptomics data, as we did before, to replicate the batch effect identification.

In [1]:
import pandas as pd
import awswrangler as wr
import copy
from pydeseq2 import preprocessing

## Data processing

This is merely a reformatted and documented version of Daniel Ence's original processing in his [original notebook](https://github.com/enveda/dx-transcriptomics/blob/main/notebooks/scratch.ipynb).

In [2]:
# Load the raw long-format counts table (tab-separated, gzip-compressed).
# Column names are supplied explicitly rather than read from the file.
raw_columns = ["path", "symbol", "gene_id", "count"]
data = wr.s3.read_csv(
    "transcriptomics_raw_data.txt.gz",
    names=raw_columns,
    sep="\t",
    compression="gzip",
)

In [3]:
# Preview the first five rows of the raw table
data.head(n=5)

Unnamed: 0,path,symbol,gene_id,count
0,sparc-cd-genewiz/hit-counts/hit-counts/FS01628...,"DDX11L1, transcribed_unprocessed_pseudogene, H...",ENSG00000223972,0
1,sparc-cd-genewiz/hit-counts/hit-counts/FS01628...,"WASH7P, unprocessed_pseudogene, HGNC:38034",ENSG00000227232,69
2,sparc-cd-genewiz/hit-counts/hit-counts/FS01628...,"MIR6859-1, miRNA, HGNC:50039",ENSG00000278267,21
3,sparc-cd-genewiz/hit-counts/hit-counts/FS01628...,"MIR1302-2HG, lncRNA, HGNC:52482",ENSG00000243485,0
4,sparc-cd-genewiz/hit-counts/hit-counts/FS01628...,"FAM138A, lncRNA, HGNC:32334",ENSG00000237613,0


### Standardizing column naming

Unpacking the path column into `batch`, `filename` and `sample_id`.

In [4]:
# Split the path once and reuse the parts: the first component is used as the
# batch and the fourth component as the file name.
path_parts = data["path"].str.split("/")

# Generate the batch column from the first part of path
data["batch"] = path_parts.str[0]

# Get the sample_id from the filename (fourth part of path), as the part
# before the first dot, and store it with the pandas "string" dtype
data["sample_id"] = path_parts.str[3].str.split(".").str[0].astype("string")

# Release the intermediate Series of split parts; it is as long as the table
del path_parts

# Keep only the gene symbol, i.e. the text before the first comma. The
# remaining comma-separated fields (feature type, HGNC id) are not used
# downstream, so they are never materialized as columns.
data["symbol"] = data["symbol"].str.split(",", n=1).str[0]

# Drop the path column, as it is no longer needed
data.drop(columns=["path"], inplace=True)

In [5]:
# Work on an independent copy of `data`, so the steps below can be rerun from
# this midpoint if they fail (e.g. because of memory) without redoing the
# earlier processing
mid_processed = data.copy(deep=True)

# gene_name is "<symbol>_<gene_id>"
mid_processed["gene_name"] = mid_processed["symbol"] + "_" + mid_processed["gene_id"]

# expand_sample is "<sample_id>_<batch>"
mid_processed["expand_sample"] = (
    mid_processed["sample_id"] + "_" + mid_processed["batch"]
)

# Drop the symbol, gene_id, sample_id, and batch columns, as they are now
# encoded in the two composite columns, then discard rows whose gene_name is
# missing: unnamed genes appear as duplicates and cannot be told apart
mid_processed = mid_processed.drop(
    columns=["symbol", "gene_id", "sample_id", "batch"]
).dropna(subset=["gene_name"])

### Normalizing and dropping low variance

Pivot the table and isolate `sample_id` and `batch`.

In [None]:
# Reshape the long table to wide form: pivot to genes x samples, then
# transpose so that each row is one expanded sample and each column one gene
genes_by_samples_df = mid_processed.pivot(
    index="gene_name", columns="expand_sample", values="count"
).T

# Recover sample_id and batch from the "<sample_id>_<batch>" row labels.
# NOTE(review): taking the first and second "_"-separated pieces assumes that
# neither the sample id nor the batch name contains an underscore - confirm.
expanded_parts = genes_by_samples_df.index.str.split("_")
genes_by_samples_df["sample_id"] = expanded_parts.str[0]
genes_by_samples_df["batch"] = expanded_parts.str[1]

Perform normalization with `pydeseq2` on each of the batches, multiply by the size factors, then concatenate the table back together.

In [None]:
# Columns that identify a row rather than hold counts
id_columns = ["sample_id", "batch"]

# Process every batch on its own, in order of first appearance
unique_batches = genes_by_samples_df["batch"].unique()
processed_batches = []
for batch in unique_batches:
    in_batch = genes_by_samples_df["batch"] == batch
    batch_df = genes_by_samples_df.loc[in_batch]

    # Separate the identifiers from the numeric count matrix
    batch_ids = batch_df[id_columns]
    batch_counts = batch_df.drop(columns=id_columns)

    # Run the pydeseq2 normalization on the bare array of counts; it returns
    # the transformed counts together with the size factors
    normalized_counts, size_factors = preprocessing.deseq2_norm(batch_counts.values)

    # Re-attach the sample/gene labels, then multiply each row by its size
    # factor.
    # NOTE(review): if deseq2_norm returns counts already divided by the
    # per-sample size factors, this multiplication gives back the raw counts -
    # confirm this is the intended output.
    rescaled = pd.DataFrame(
        normalized_counts,
        index=batch_counts.index,
        columns=batch_counts.columns,
    ).mul(size_factors, axis=0)

    # Put the identifier columns back and keep the result for concatenation
    rescaled[id_columns] = batch_ids
    processed_batches.append(rescaled)

# Stack the per-batch tables row-wise into a single table
processed_batches_df = pd.concat(processed_batches)

Drop columns with zero variance

In [None]:
# Drop the gene columns whose variance across all samples is exactly 0

id_columns = ["sample_id", "batch"]

# Identifier columns, kept under their original name
processed_batches_ids = processed_batches_df[id_columns]

# Compute the per-gene variance from a single count-only view of the table
# (the identifier columns are not numeric and must be left out)
gene_variances = processed_batches_df.drop(columns=id_columns).var()

# Genes to keep: every gene whose variance is not 0
kept_genes = gene_variances.loc[gene_variances != 0].index

# Select the kept genes followed by the identifier columns in one step, then
# replace the sample labels in the index with a plain 0..n-1 range
final_processed_df = processed_batches_df[[*kept_genes, *id_columns]].reset_index(
    drop=True
)

Keep only the columns with a gene name

In [None]:
# Keep only the columns whose name has a non-empty part before the first "_".
# Gene columns are named "<symbol>_<gene_id>", so this removes the genes with
# an empty symbol.
only_named_columns = [
    column for column in final_processed_df.columns if column.partition("_")[0] != ""
]

final_processed_df = final_processed_df[only_named_columns]

Write the output table

In [None]:
# Persist the final table as Parquet, without writing the row index
final_processed_df.to_parquet(
    "transcriptomics_pydeseq_corrected.parquet",
    index=False,
)