In [15]:
import pandas as pd
import os

def confirm_columns(df, expected_columns):
    """
    Text confirmation of column presence in the dataset.
    
    Args:
        df (pd.DataFrame): DataFrame containing the dataset.
        expected_columns (list): List of expected column names.
    
    Returns:
        None
    """
    found_columns = [col for col in expected_columns if col in df.columns]
    missing_columns = [col for col in expected_columns if col not in df.columns]

    print("\n🔍 **Column Check Report** 🔍")
    print(f"\n✔️ Found Columns ({len(found_columns)}):")
    for col in found_columns:
        print(f"   - {col}")
    
    if missing_columns:
        print(f"\n❌ Missing Columns ({len(missing_columns)}):")
        for col in missing_columns:
            print(f"   - {col}")
    else:
        print("\n🎉 All expected columns are present!")

def load_and_clean_data(file_path, expected_columns):
    """
    Load and clean the dataset for CHO cell culture kinetics.
    - Confirms the presence of expected columns.
    - Removes spaces in column names.
    - Renames columns for clarity.
    - Cleans numerical data and converts to proper data types.
    - Converts 'Clone' column to categorical.
    
    Args:
        file_path (str): Path to the CSV file.
        expected_columns (list): List of expected column names.

    Returns:
        pd.DataFrame: A cleaned DataFrame ready for analysis.
    """
    df = pd.read_csv(file_path, skiprows=1)
    df.columns = df.columns.str.strip()
    confirm_columns(df, expected_columns)
    
    df.rename(columns={
        "T": "Time (days)",
        "G": "Glucose (g/L)",
        "Gln": "Glutamine (mmol/L)",
        "Xv": "Viable Cells (cells/mL)",
        "Xd": "Dead Cells (cells/mL)",
        "L": "Lactate (g/L)",
        "Glu": "Glutamate (mmol/L)",
        "V": "Viability (%)",
        "MAb": "Antibody Concentration (mg/mL)",
        "rP": "Recombinant Protein (mg/mL)",
        "rep": "Replicate"
    }, inplace=True)
    
    for column in df.columns:
        if column == "Clone":
            continue
        if df[column].dtype == object:
            df[column] = pd.to_numeric(df[column].str.replace('%', '', regex=False), errors='coerce')
        elif df[column].dtype in [int, float]:
            df[column] = pd.to_numeric(df[column], errors='coerce')
    
    if "Clone" in df.columns:
        df["Clone"] = df["Clone"].astype("category")
    
    return df

# Define the expected columns
expected_columns = [
    "Clone", "T", "G", "Gln", "Xv", "Xd", "L", "V", "MAb", "rP", "rep"
]

# Simplify the file path definition
dataset_path = 'data/2024-05-18_Clones_B_C_Kinetics.csv'

# Load and clean the dataset
kinetics_data = load_and_clean_data(dataset_path, expected_columns)

# Display the first few rows of the cleaned data
print(kinetics_data.head())


FileNotFoundError: [Errno 2] No such file or directory: 'data/2024-05-18_Clones_B_C_Kinetics.csv'

In [14]:
# Define the expected columns
expected_columns = [
    "Clone", "T", "G", "Gln", "Xv", "Xd", "L", "V", "MAb", "rP", "rep"
]

# Define the file path relative to the script/notebook location
current_dir = os.path.dirname(__file__)  # Directory of the current script/notebook
dataset_path = os.path.join(current_dir, '..', 'data', '2024-05-18_Clones_B_C_Kinetics.csv')

# Load and clean the dataset
kinetics_data = load_and_clean_data(dataset_path, expected_columns)

# Display the first few rows of the cleaned data
display(kinetics_data.head())


NameError: name '__file__' is not defined

In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set the plot theme
sns.set_theme(style="darkgrid")

def create_output_directory(csv_filename):
    """
    Create an output directory based on the name of the CSV file.
    
    Args:
        csv_filename (str): Name of the input CSV file.
    
    Returns:
        str: Path to the created output directory.
    """
    # Extract the base name without extension
    base_name = os.path.splitext(os.path.basename(csv_filename))[0]
    output_dir = os.path.join(os.getcwd(), f"plots_{base_name}")
    os.makedirs(output_dir, exist_ok=True)
    return output_dir

# Define the name of the CSV file for this kinetic experiment
csv_file = "2024-05-18_Clones_B_C_Kinetics.csv"

# Create an output directory
output_dir = create_output_directory(csv_file)

# Example DataFrame name: Replace `df` with your cleaned DataFrame
df = kinetics_data  # Assuming kinetics_data is the cleaned DataFrame

# Plot: Time vs VCD
plt.figure(figsize=(10, 6))
sns.lineplot(x="Time (days)", y="Viable Cells (cells/mL)", hue="Clone", style="Clone", markers=True, data=df)
plt.xlabel('Time (days)')
plt.ylabel('Viable Cells (cells/mL)')
plt.ylim(0, 25000000)
plt.title('Time vs Viable Cell Density (VCD)')
plt.grid(True)
plt.legend(title='Clone')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "time_vs_vcd.png"), format="png", dpi=300)
plt.show()

# Plot: Time vs Viability
plt.figure(figsize=(10, 6))
sns.lineplot(x="Time (days)", y="Viability (%)", hue="Clone", style="Clone", markers=True, data=df)
plt.xlabel('Time (days)')
plt.ylabel('Viability (%)')
plt.title('Time vs Viability')
plt.ylim(0, 110)
plt.grid(True)
plt.legend(title='Clone')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "time_vs_viability.png"), format="png", dpi=300)
plt.show()

# Subplots: Glucose and Lactate vs Time
fig, axes = plt.subplots(2, 1, figsize=(10, 12))
sns.lineplot(ax=axes[0], x="Time (days)", y="Glucose (g/L)", hue="Clone", style="Clone", markers=True, data=df)
axes[0].set_title('Time vs Glucose')
axes[0].set_ylabel('Glucose (g/L)')
axes[0].set_ylim(0, 10)
axes[0].grid(True)
axes[0].legend(title='Clone')

sns.lineplot(ax=axes[1], x="Time (days)", y="Lactate (g/L)", hue="Clone", style="Clone", markers=True, data=df)
axes[1].set_title('Time vs Lactate')
axes[1].set_ylabel('Lactate (g/L)')
axes[1].set_ylim(0, 15)
axes[1].grid(True)
axes[1].legend(title='Clone')

fig.tight_layout()
fig.savefig(os.path.join(output_dir, "time_vs_glucose_and_lactate.png"), format="png", dpi=300)
plt.show()

# Subplots: Glutamine and Glutamate vs Time
fig, axes = plt.subplots(2, 1, figsize=(10, 12))
sns.lineplot(ax=axes[0], x="Time (days)", y="Glutamine (mmol/L)", hue="Clone", style="Clone", markers=True, data=df)
axes[0].set_title('Time vs Glutamine')
axes[0].set_ylabel('Glutamine (mmol/L)')
axes[0].set_ylim(0, 8)
axes[0].grid(True)
axes[0].legend(title='Clone')

sns.lineplot(ax=axes[1], x="Time (days)", y="Glutamate (mmol/L)", hue="Clone", style="Clone", markers=True, data=df)
axes[1].set_title('Time vs Glutamate')
axes[1].set_ylabel('Glutamate (mmol/L)')
axes[1].set_ylim(0, 50)
axes[1].grid(True)
axes[1].legend(title='Clone')

fig.tight_layout()
fig.savefig(os.path.join(output_dir, "time_vs_glutamine_and_glutamate.png"), format="png", dpi=300)
plt.show()

# Plot with two Y axes: Time vs VCD and Viability
fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.set_xlabel('Time (days)')
ax1.set_ylabel('Viable Cells (cells/mL)', color='tab:blue')
sns.lineplot(ax=ax1, x="Time (days)", y="Viable Cells (cells/mL)", hue="Clone", style="Clone", markers=True, data=df, legend=False)
ax1.tick_params(axis='y', labelcolor='tab:blue')
ax1.set_ylim(0, 25000000)

ax2 = ax1.twinx()
ax2.set_ylabel('Viability (%)', color='tab:red')
sns.lineplot(ax=ax2, x="Time (days)", y="Viability (%)", hue="Clone", style="Clone", markers=True, data=df)
ax2.tick_params(axis='y', labelcolor='tab:red')
ax2.set_ylim(-0.5, 105)

plt.title('Time vs VCD and Viability')
fig.tight_layout()
plt.grid(True)
fig.savefig(os.path.join(output_dir, "time_vs_vcd_and_viability.png"), format="png", dpi=300)
plt.show()


NameError: name 'kinetics_data' is not defined

In [16]:
import pandas as pd
import os

def confirm_columns(df, expected_columns):
    """
    Text confirmation of column presence in the dataset.
    
    Args:
        df (pd.DataFrame): DataFrame containing the dataset.
        expected_columns (list): List of expected column names.
    
    Returns:
        None
    """
    found_columns = [col for col in expected_columns if col in df.columns]
    missing_columns = [col for col in expected_columns if col not in df.columns]

    print("\n🔍 **Column Check Report** 🔍")
    print(f"\n✔️ Found Columns ({len(found_columns)}):")
    for col in found_columns:
        print(f"   - {col}")
    
    if missing_columns:
        print(f"\n❌ Missing Columns ({len(missing_columns)}):")
        for col in missing_columns:
            print(f"   - {col}")
    else:
        print("\n🎉 All expected columns are present!")

def load_and_clean_data(file_path, expected_columns):
    """
    Load and clean the dataset for CHO cell culture kinetics.
    - Confirms the presence of expected columns.
    - Removes spaces in column names.
    - Renames columns for clarity.
    - Cleans numerical data and converts to proper data types.
    - Converts 'Clone' column to categorical.
    
    Args:
        file_path (str): Path to the CSV file.
        expected_columns (list): List of expected column names.

    Returns:
        pd.DataFrame: A cleaned DataFrame ready for analysis.
    """
    df = pd.read_csv(file_path, skiprows=1)
    df.columns = df.columns.str.strip()
    confirm_columns(df, expected_columns)
    
    df.rename(columns={
        "T": "Time (days)",
        "G": "Glucose (g/L)",
        "Gln": "Glutamine (mmol/L)",
        "Xv": "Viable Cells (cells/mL)",
        "Xd": "Dead Cells (cells/mL)",
        "L": "Lactate (g/L)",
        "Glu": "Glutamate (mmol/L)",
        "V": "Viability (%)",
        "MAb": "Antibody Concentration (mg/mL)",
        "rP": "Recombinant Protein (mg/mL)",
        "rep": "Replicate"
    }, inplace=True)
    
    for column in df.columns:
        if column == "Clone":
            continue
        if df[column].dtype == object:
            df[column] = pd.to_numeric(df[column].str.replace('%', '', regex=False), errors='coerce')
        elif df[column].dtype in [int, float]:
            df[column] = pd.to_numeric(df[column], errors='coerce')
    
    if "Clone" in df.columns:
        df["Clone"] = df["Clone"].astype("category")
    
    return df

# Define the expected columns
expected_columns = [
    "Clone", "T", "G", "Gln", "Xv", "Xd", "L", "V", "MAb", "rP", "rep"
]

# Simplify the file path definition
dataset_path = 'data/2024-05-18_Clones_B_C_Kinetics.csv'

# Load and clean the dataset
kinetics_data = load_and_clean_data(dataset_path, expected_columns)

# Display the first few rows of the cleaned data
print(kinetics_data.head())


FileNotFoundError: [Errno 2] No such file or directory: 'data/2024-05-18_Clones_B_C_Kinetics.csv'

In [None]:
PendingDeprecationWarning


 El volumen de la unidad C no tiene etiqueta.
 El n�mero de serie del volumen es: CE09-4C7F

 Directorio de c:\Users\ebald\OneDrive\Documentos\GitHub\clonalyzer\scripts

01/12/2024  11:03 p. m.    <DIR>          .
01/12/2024  10:48 p. m.    <DIR>          ..
01/12/2024  10:16 p. m.            10,780 clonalyzer.ipynb
               1 archivos         10,780 bytes
               2 dirs  407,275,642,880 bytes libres
