# Quality Control and Filtering for Raw Data

In [2]:
import os
import importlib
import logging
import pandas as pd
import scanpy as sc
import cv2
import numpy as np
from typing import Tuple, List, Dict, Optional, Union
import inspect
import sys
import spatioloji as sj

## Prepare original data
- Set up directory paths for the original data and for saved outputs
- Load the original data
- Check the rows/columns of all tables and apply any necessary modifications
- Save data

In [None]:
# Directory layout used by this notebook
data_read = './test_data/'          # raw input tables are read from here
data_save = './test_data/'          # processed / intermediate tables are written here
analysis_save = './test_analysis/'  # plots are written here
image_dir = './test_data/image/'

# Read the expression matrix and the per-cell metadata table
expr_matrix = pd.read_csv(data_read+'Run5452_S2_exprMat_file.csv')
cell_metadata = pd.read_csv(data_read+'Run5452_S2_metadata_file.csv')

# Both tables get the same treatment: 'fov' is stored as a string, a
# "<fov>_<cell_ID>" identifier is written to a 'cell' column, and that
# column also becomes the row index (the column itself is kept).
for table in (expr_matrix, cell_metadata):
    table['fov'] = table['fov'].astype(str)
    table['cell'] = table['fov'] + '_' + table['cell_ID'].astype(str)
    table.index = table['cell']

# Restrict both tables to the cell identifiers present in each of them,
# so that they end up with the same rows in the same order
common_cells = expr_matrix.index.intersection(cell_metadata.index)
expr_matrix, cell_metadata = expr_matrix.loc[common_cells], cell_metadata.loc[common_cells]

# Spatial tables: FOV positions, with 'fov' converted to a string ...
fov_positions_orig = pd.read_csv(data_read+'Run5452_S2_fov_positions_file.csv').assign(
    fov=lambda df: df['fov'].astype(str),
)
# ... and cell polygons, which additionally get the same "<fov>_<cell_ID>"
# identifier in a 'cell' column (the row index is left untouched here)
polygon_file_orig = pd.read_csv(data_read+'Run5452_S2-polygons.csv').assign(
    fov=lambda df: df['fov'].astype(str),
    cell=lambda df: df['fov'] + '_' + df['cell_ID'].astype(str),
)


## Instantiation of Spatioloji_qc object
- Set up the output folder
- Read the expression matrix and the matching cell metadata
- Run the QC pipeline
- Save data

In [None]:
# Build the QC object; analysis_save is passed as its output directory
sp_ji_qc = sj.data.spatioloji_qc(output_dir=analysis_save)

# Attach the tables that were already loaded and aligned in memory
sp_ji_qc.expr_matrix = expr_matrix
sp_ji_qc.cell_metadata = cell_metadata

# Run the QC pipeline; it returns two results that are unpacked here
filtered_cells, filtered_genes = sp_ji_qc.run_qc_pipeline()

# Identifiers that survived QC: the index of filtered_cells is matched
# against the 'cell' columns below, its 'fov' values against 'fov'
kept_cell_ids = filtered_cells.index.tolist()
kept_fovs = filtered_cells.fov.tolist()

# Keep only the rows of the spatial and metadata tables that match
fov_mask = fov_positions_orig['fov'].isin(kept_fovs)
fov_positions = fov_positions_orig[fov_mask]

polygon_mask = polygon_file_orig['cell'].isin(kept_cell_ids)
polygon_file = polygon_file_orig[polygon_mask]

metadata_mask = cell_metadata['cell'].isin(kept_cell_ids)
cell_metadata = cell_metadata[metadata_mask]
# Drop the index name so the index no longer shares its name with the 'cell' column
cell_metadata.index.name = None

# Write the filtered tables. The FOV positions file is usually identical to
# the input, because QC normally does not remove whole FOVs; it is written
# anyway so that all filtered files sit side by side.
for frame, filename in (
    (fov_positions, 'filtered_fov_positions.csv'),
    (polygon_file, 'filtered_polygons.csv'),
    (cell_metadata, 'filtered_metadata_file.csv'),
    (filtered_genes, 'filtered_exprMat_file.csv'),
):
    frame.to_csv(data_save+filename)

# Per-cell centre coordinates, taken from the filtered metadata
coordinate_columns = ['CenterX_local_px', 'CenterY_local_px']
cell_metadata[coordinate_columns].to_csv(data_save+'filtered_coordinates.csv')

