# Data Preprocessing: Loading and Annotating Flow Cytometry Files

This script is part of the **CHOFlow** repository and focuses on preprocessing flow cytometry data from CHO cell experiments. The main functionalities of this script include:

1. **Loading `.fcs` Files**: Automatically retrieves and processes raw data from `.fcs` files found in the `JC1` subfolder of each time-point folder in the working directory.
2. **Annotating Data**: Adds experimental metadata such as `Time`, `Clone`, `Replicate`, and `Sample_Type` to each event in the dataset.
3. **Sample Type Classification**: Matches file-name patterns to categorize each file as `ST` (staining control), `Control` (CCCP control), or `Sample`; files that match none of the patterns are labeled `Unknown`.
4. **Exporting Processed Data**: Consolidates all processed data into a single CSV file for downstream analysis.

### Output
The final output is a `processed_data.csv` file, written to a `data` folder inside the working directory, containing the annotated flow cytometry data, ready for visualization and analysis.

### Requirements
- Python 3.7+
- `pandas`
- `numpy`
- `FlowKit`
- `matplotlib`
- `IPython`

This script serves as the foundational step in automating and streamlining the analysis of CHO cell flow cytometry data.

In [None]:
import os
import re
import pandas as pd
import numpy as np
import flowkit as fk
from datetime import datetime
import matplotlib.pyplot as plt
from IPython.display import display

In [None]:
# Main directory where the script is executed; each of its immediate
# subdirectories is treated as a candidate time-point folder.
main_directory = os.getcwd()


def _time_label(folder_name):
    """Return the time-point label encoded at the end of a folder name.

    The whole run of trailing digits is used, so multi-digit time points
    (e.g. a folder ending in "10") are not truncated to their last digit.
    When the name does not end in a digit, the last character is returned,
    which is what a plain ``folder_name[-1]`` would give.
    """
    trailing_digits = re.search(r"\d+$", folder_name)
    return trailing_digits.group() if trailing_digits else folder_name[-1]


# Retrieve all subdirectories of the main directory. They are sorted because
# os.listdir() returns entries in arbitrary order, which would make the row
# order of the final DataFrame differ from run to run.
time_folders = sorted(
    folder for folder in os.listdir(main_directory)
    if os.path.isdir(os.path.join(main_directory, folder))
)

# Initialize an empty list to store one DataFrame per .fcs file
dataframes = []

# Process each time folder to load data and assign metadata
for folder in time_folders:
    # Build the path to the "JC1" subfolder
    jc1_folder_path = os.path.join(main_directory, folder, "JC1")

    # Skip folders that do not contain a "JC1" subfolder
    if not os.path.isdir(jc1_folder_path):
        continue

    # List all .fcs files in the "JC1" folder. The extension check is
    # case-insensitive so files saved as ".FCS" are not silently skipped.
    fcs_files = sorted(
        file for file in os.listdir(jc1_folder_path)
        if file.lower().endswith(".fcs")
    )

    # The time label depends only on the folder, so compute it once per folder
    time_label = _time_label(folder)

    # Process each .fcs file and add metadata
    for file in fcs_files:
        file_path = os.path.join(jc1_folder_path, file)

        # Load the .fcs file using FlowKit
        sample = fk.Sample(file_path)

        # Retrieve the raw events as a DataFrame, using the "pnn" entry of
        # the sample's channel table as column names
        events_df = pd.DataFrame(
            sample.get_events(source="raw"),
            columns=sample.channels["pnn"]
        )

        # Add file and folder metadata to every event
        events_df["File_Name"] = file  # Source file name
        events_df["Folder"] = folder  # Parent folder name
        events_df["Time"] = time_label  # Time point taken from the folder name

        # Append the processed DataFrame to the list
        dataframes.append(events_df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing was loaded.
if not dataframes:
    raise ValueError(
        f"No .fcs files found in any 'JC1' subfolder of: {main_directory}"
    )

# Concatenate all DataFrames into a single DataFrame
full_data = pd.concat(dataframes, ignore_index=True)

# Print the shape of the final DataFrame for verification
print(f"Data successfully processed. Final DataFrame contains {full_data.shape[0]} rows and {full_data.shape[1]} columns.")


In [None]:
# Pull the clone letter (b or c) and the single-digit replicate number out of
# every file name in one vectorized pass. File names that do not contain the
# "_<letter><digit>_" pattern get missing values in both columns.
file_names = full_data['File_Name']
clone_and_replicate = file_names.str.extract(
    r'_(b|c)(\d)_',
    flags=re.IGNORECASE
)

# Store the clone letter in lowercase; the replicate is kept exactly as
# extracted (a one-character string, or missing when there was no match).
full_data['Clone'] = clone_and_replicate[0].str.lower()
full_data['Replicate'] = clone_and_replicate[1]

# Sample-type labels paired with the file-name pattern that identifies them.
# Order matters: np.select keeps the first pattern that matches.
sample_type_patterns = [
    ('ST', r'_ST'),             # Staining controls
    ('Control', r'controles'),  # CCCP controls
    ('Sample', r'muestras'),    # Experimental samples
]

# One case-insensitive boolean mask per pattern, plus the matching labels
conditions = [
    file_names.str.contains(pattern, case=False, na=False)
    for _, pattern in sample_type_patterns
]
choices = [label for label, _ in sample_type_patterns]

# Files whose name matches none of the patterns are labeled 'Unknown'
full_data['Sample_Type'] = np.select(conditions, choices, default='Unknown')

# Controls do not belong to a clone: overwrite Clone with 'ST' for staining
# controls and 'CCCP' for CCCP controls.
for sample_type, clone_label in (('ST', 'ST'), ('Control', 'CCCP')):
    full_data.loc[full_data['Sample_Type'] == sample_type, 'Clone'] = clone_label





In [None]:
# Group by 'Time' and 'File_Name' and collect the distinct 'Sample_Type'
# value(s) of each file, so the label assigned to every file can be reviewed.
file_label_by_time = (
    full_data.groupby(['Time', 'File_Name'])['Sample_Type']
    .unique()
    .reset_index()
)

# Display the entire DataFrame. The row limit is lifted only inside this
# block: option_context restores the previous 'display.max_rows' value on
# exit, even if display() raises, rather than resetting it to the pandas
# default.
with pd.option_context('display.max_rows', None):
    display(file_label_by_time)

In [None]:
# Define the path for the "data" folder
data_folder_path = os.path.join(main_directory, "data")

# Create the "data" folder if it is missing. Attempting the creation and
# handling FileExistsError avoids the gap between "check" and "create" that an
# os.path.exists() test would leave open.
try:
    os.makedirs(data_folder_path)
except FileExistsError:
    # An existing directory is fine; anything else at that path (e.g. a
    # regular file named "data") would make the export fail, so surface it now.
    if not os.path.isdir(data_folder_path):
        raise
else:
    print(f"'data' folder created at: {data_folder_path}")

# Define the path and name of the output file within the "data" folder
output_file_path = os.path.join(data_folder_path, "processed_data.csv")

# Export the DataFrame to a CSV file, without writing the row index
full_data.to_csv(output_file_path, index=False)

print(f"DataFrame successfully exported to: {output_file_path}")