In [13]:
# Import necessary libraries
import pandas as pd
import os

def confirm_columns(df, expected_columns):
    """
    Print a plain-text report of which expected columns the dataset contains.

    The report always lists the columns that were found; it then either
    lists the columns that are absent or prints a single "all present" line.
    Nothing is raised when columns are missing — the check is informational.

    Args:
        df (pd.DataFrame): DataFrame whose column labels are inspected.
        expected_columns (list): Column names that should be present.

    Returns:
        None
    """
    available = df.columns

    # Two independent passes: one collects the hits, the other the misses
    present = list(filter(lambda name: name in available, expected_columns))
    absent = list(filter(lambda name: name not in available, expected_columns))

    # Assemble the report line by line, then emit it in one go
    report = [
        "\n🔍 **Column Check Report** 🔍",
        f"\n✔️ Found Columns ({len(present)}):",
    ]
    report.extend(f"   - {name}" for name in present)

    if not absent:
        report.append("\n🎉 All expected columns are present!")
    else:
        report.append(f"\n❌ Missing Columns ({len(absent)}):")
        report.extend(f"   - {name}" for name in absent)

    print("\n".join(report))

def load_and_clean_data(file_path, expected_columns, skiprows=1):
    """
    Load and clean the dataset for CHO cell culture kinetics.

    Steps, in order:
    - Reads the CSV, skipping the leading line(s) selected by `skiprows`.
    - Strips leading and trailing whitespace from column names
      (interior spaces are left untouched).
    - Prints a found/missing report for `expected_columns` via
      `confirm_columns`; the check uses the raw (pre-rename) names and
      missing columns are only reported, never raised.
    - Renames the short column codes to descriptive labels.
    - Converts every non-numeric column except 'Clone' to a numeric dtype:
      '%' characters and surrounding whitespace are removed first, and any
      value that still cannot be parsed becomes NaN.
    - Converts the 'Clone' column to categorical, if it exists.

    Args:
        file_path (str): Path to the CSV file.
        expected_columns (list): List of expected (raw) column names.
        skiprows (int | list-like | callable, optional): Forwarded to
            `pandas.read_csv`. Defaults to 1, i.e. the first line of the
            file is treated as metadata and dropped. Pass 0 for files whose
            first line is already the header.

    Returns:
        pd.DataFrame: A cleaned DataFrame ready for analysis.
    """
    # Load the data; by default the first line is skipped as metadata
    df = pd.read_csv(file_path, skiprows=skiprows)

    # Remove leading and trailing spaces from column names
    df.columns = df.columns.str.strip()

    # Confirm the presence of expected columns (before they are renamed)
    confirm_columns(df, expected_columns)

    # Rename columns for better clarity; codes absent from the file are ignored
    df = df.rename(columns={
        "T": "Time (days)",               # Time of culture
        "G": "Glucose (g/L)",             # Glucose concentration
        "Gln": "Glutamine (mmol/L)",      # Glutamine concentration
        "Xv": "Viable Cells (cells/mL)",  # Viable cell density
        "Xd": "Dead Cells (cells/mL)",    # Dead cell density
        "L": "Lactate (g/L)",             # Lactate concentration
        "Glu": "Glutamate (mmol/L)",      # Glutamate concentration
        "V": "Viability (%)",             # Viability as a percentage (0-100)
        "MAb": "Antibody Concentration (mg/mL)",  # Monoclonal antibody concentration
        "rP": "Recombinant Protein (mg/mL)",      # Recombinant protein concentration
        "rep": "Replicate"                # Replicate number
    })

    # Clean numeric columns, converting to proper numeric types
    for column in df.columns:
        if column == "Clone":  # Skip the 'Clone' column (categorical label)
            continue
        # Columns that pandas already parsed as numbers need no work
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        # Anything else is treated as text. Testing "not numeric" rather than
        # "dtype == object" also catches pandas string dtypes, and casting to
        # str first keeps real numbers in a mixed column from turning into
        # NaN under the .str accessor.
        as_text = (
            df[column]
            .astype(str)
            .str.replace('%', '', regex=False)
            .str.strip()
        )
        df[column] = pd.to_numeric(as_text, errors='coerce')

    # Convert 'Clone' column to a categorical data type
    if "Clone" in df.columns:
        df["Clone"] = df["Clone"].astype("category")

    return df

In [16]:
# Raw (pre-rename) column codes the CSV header is expected to contain
# NOTE(review): "Glu" is renamed by load_and_clean_data but is not listed
# here, so its presence is never reported — confirm whether that is intended.
expected_columns = [
    "Clone",
    "T", "G", "Gln",
    "Xv", "Xd",
    "L", "V",
    "MAb", "rP",
    "rep",
]

# The dataset sits in a 'data' folder under the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
dataset_path = os.path.join(data_dir, '2024-05-18_Clones_B_C_Kinetics.csv')

# Read the CSV and run it through the cleaning routine
kinetics_data = load_and_clean_data(dataset_path, expected_columns)

# Preview the top rows of the cleaned table
display(kinetics_data.head())


🔍 **Column Check Report** 🔍

✔️ Found Columns (9):
   - Clone
   - T
   - G
   - Gln
   - Xv
   - L
   - V
   - MAb
   - rep

❌ Missing Columns (2):
   - Xd
   - rP


Unnamed: 0,Clone,Time (days),Viable Cells (cells/mL),Viability (%),Glucose (g/L),Lactate (g/L),Glutamine (mmol/L),Glutamate (mmol/L),Antibody Concentration (mg/mL),Replicate
0,B,0.0,284000.0,100.0,7.419121,0.004255,5.575516,1.821962,0.0,1
1,B,0.0,288000.0,100.0,7.515516,0.004507,5.572859,1.811074,0.0,2
2,B,0.0,278000.0,100.0,7.804288,0.004333,5.487487,1.702079,0.0,3
3,C,0.0,302000.0,100.0,8.03155,0.00432,5.988051,1.876903,0.0,1
4,C,0.0,298000.0,100.0,6.737842,0.003946,5.3397,1.918827,0.0,2
