# Comparison of Mutant and Wildtype Osteocyte Cell Morphology

This notebook compares cell morphology features between wildtype (control) and mutant (LTBP3-deficient) osteocytes.
It loads data from CSV files in the `results/metrics/wildtype/` and `results/metrics/mutant/` folders.
If neither folder contains any CSVs (e.g., the pipeline has not been run yet), it falls back to hardcoded example data.

Analyses:
1. PCA of cell-morphology features (wildtype: blue circles, mutant: red triangles).
2. Violin plots for each feature with p-values (Mann-Whitney U test).

Required libraries: pandas, numpy, matplotlib, seaborn, scikit-learn, scipy.

**Note**: Run the main workflow script first to generate CSVs in `results/metrics/`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import mannwhitneyu
from pathlib import Path

# Root folder holding the per-condition metrics CSVs
# (relative path; adjust if the notebook is not run from the notebooks folder)
metrics_dir = Path('../results/metrics')
wildtype_dir = metrics_dir.joinpath('wildtype')
mutant_dir = metrics_dir.joinpath('mutant')

# Morphology feature columns compared between the two conditions
features = [
    'area',
    'convex_area',
    'bbox_area',
    'equivalent_diameter',
    'perimeter',
    'extent',
    'eccentricity',
    'form_factor',
    'solidity',
    'compactness',
]

In [None]:
# Load all CSV files from a directory and add condition label
def load_csvs_from_dir(directory: Path, condition: str) -> pd.DataFrame:
    """Read every ``*.csv`` directly inside *directory* into one DataFrame.

    Each file is read with ``pd.read_csv`` and gets a ``condition`` column
    set to *condition*; the per-file frames are concatenated with a fresh
    integer index.

    Args:
        directory: Folder to search (non-recursively) for ``*.csv`` files.
        condition: Label written to the ``condition`` column of every row.

    Returns:
        The concatenated data, or an empty ``pd.DataFrame()`` (after printing
        a notice) when no CSV file is found. The caller decides whether to
        fall back to example data.
    """
    # Path.glob yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the row order of the result reproducible across machines.
    frames = [
        pd.read_csv(csv_path).assign(condition=condition)
        for csv_path in sorted(directory.glob('*.csv'))
    ]
    if not frames:
        # Do not promise a fallback here: that decision belongs to the caller.
        print(f'No CSV files found in {directory}.')
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Load wildtype and mutant data
df_wildtype = load_csvs_from_dir(wildtype_dir, 'wildtype')
df_mutant = load_csvs_from_dir(mutant_dir, 'mutant')

# If both are empty, use hardcoded example data
if df_wildtype.empty and df_mutant.empty:
    print('Loading hardcoded example data.')
    # Hardcoded mutant example
    df_mutant = pd.DataFrame({
        'label': [1, 2, 4, 7, 8, 10],
        'area': [113.0, 20.0, 14.0, 14.0, 998.0, 25.0],
        'convex_area': [122.0, 21.0, 14.0, 14.0, 1580.0, 29.0],
        'bbox_area': [144.0, 24.0, 16.0, 15.0, 2407.0, 35.0],
        'equivalent_diameter': [11.9948, 5.0463, 4.2220, 4.2220, 35.6468, 5.6419],
        'perimeter': [46.5563, 14.6213, 10.8284, 11.4142, 281.5219, 16.8640],
        'extent': [0.7847, 0.8333, 0.8750, 0.9333, 0.4146, 0.7143],
        'eccentricity': [0.8070, 0.7289, 0.2037, 0.8165, 0.9759, 0.8250],
        'form_factor': [0.6551, 1.1756, 1.5004, 1.3503, 0.1582, 1.1047],
        'solidity': [0.9262, 0.9524, 1.0, 1.0, 0.6316, 0.8621],
        'compactness': [1.5264, 0.8506, 0.6665, 0.7405, 6.3195, 0.9053],
        'condition': ['mutant'] * 6
    })

    # Hardcoded wildtype example
    df_wildtype = pd.DataFrame({
        'label': [2, 3, 5, 9, 12, 16],
        'area': [121.0, 26.0, 21.0, 72.0, 38.0, 14.0],
        'convex_area': [211.0, 29.0, 29.0, 125.0, 45.0, 15.0],
        'bbox_area': [465.0, 35.0, 42.0, 238.0, 49.0, 16.0],
        'equivalent_diameter': [12.4122, 5.7536, 5.1709, 9.5746, 6.9558, 4.2220],
        'perimeter': [76.5269, 18.4853, 16.8284, 52.7635, 26.4853, 12.2426],
        'extent': [0.2602, 0.7429, 0.5000, 0.3025, 0.7755, 0.8750],
        'eccentricity': [0.9772, 0.7227, 0.8009, 0.9538, 0.6467, 0.5668],
        'form_factor': [0.2596, 0.9562, 0.9318, 0.3250, 0.6807, 1.1738],
        'solidity': [0.5735, 0.8966, 0.7241, 0.5760, 0.8444, 0.9333],
        'compactness': [3.8515, 1.0458, 1.0731, 3.0770, 1.4690, 0.8519],
        'condition': ['wildtype'] * 6
    })
elif df_wildtype.empty or df_mutant.empty:
    # Example data is deliberately NOT mixed with real measurements, so when
    # only one condition is missing the notebook continues with what it has.
    # Say so explicitly instead of letting the later plots/tests be
    # silently one-sided.
    missing_condition = 'wildtype' if df_wildtype.empty else 'mutant'
    print(
        f'WARNING: no data loaded for the {missing_condition} condition; '
        'the wildtype-vs-mutant comparison below will be incomplete. '
        'Example data is only used when both conditions are missing.'
    )

# Combine into one DataFrame
df_all = pd.concat([df_wildtype, df_mutant], ignore_index=True)

# Filter to only the required features and drop NaNs
# (rows with a missing value in any selected column are removed)
df_features = df_all[features + ['condition']].dropna()

# Display summary (last expression of the cell, so the notebook renders it)
df_features.groupby('condition').describe()

## 1. PCA of Cell-Morphology Features

In [None]:
# Feature matrix and matching condition labels
X = df_features[features]
y = df_features['condition']

# Standardize the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep two principal components, enough for a 2-D scatter plot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Projected coordinates together with each row's condition
df_pca = pd.DataFrame({
    'PC1': X_pca[:, 0],
    'PC2': X_pca[:, 1],
    'condition': y.values,
})

# Scatter plot: one layer per condition, wildtype first
plt.figure(figsize=(8, 6))
for group_name, group_color, group_marker, group_label in (
    ('wildtype', 'blue', 'o', 'Wildtype (Control)'),
    ('mutant', 'red', '^', 'Mutant (LTBP3)'),
):
    sns.scatterplot(
        data=df_pca[df_pca['condition'] == group_name],
        x='PC1',
        y='PC2',
        color=group_color,
        marker=group_marker,
        label=group_label,
    )
plt.title('PCA of Cell Morphology Features')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.legend()
plt.show()

## 2. Violin Plots of Features with P-Values

In [None]:
# Function to compute p-value (Mann-Whitney U test)
def compute_p_value(feature, data=None):
    """Return the two-sided Mann-Whitney U p-value for one feature.

    Compares the values of *feature* in rows whose ``condition`` is
    ``'wildtype'`` against rows whose ``condition`` is ``'mutant'``.

    Args:
        feature: Name of the column to compare.
        data: DataFrame with a ``condition`` column and the *feature* column.
            Defaults to the notebook-level ``df_features`` when omitted.

    Returns:
        The p-value reported by ``scipy.stats.mannwhitneyu``.
    """
    frame = df_features if data is None else data
    wild = frame.loc[frame['condition'] == 'wildtype', feature]
    mut = frame.loc[frame['condition'] == 'mutant', feature]
    # State the alternative explicitly: SciPy releases before 1.7 did not
    # default to a two-sided test.
    return mannwhitneyu(wild, mut, alternative='two-sided').pvalue

# Plot violin plots for each feature
# Size the grid from the feature list instead of hard-coding 5x2, so that
# adding or removing a feature neither overruns the axes array nor leaves
# blank panels. With the current 10 features this is still 5x2 at (12, 20).
n_cols = 2
n_rows = max(1, (len(features) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
axes = axes.flatten()

for ax, feature in zip(axes, features):
    p = compute_p_value(feature)
    sns.violinplot(data=df_features, x='condition', y=feature, ax=ax, inner='point')
    ax.set_title(f'{feature} (p = {p:.2e})')
    ax.set_xlabel('Condition')
    ax.set_ylabel(feature.capitalize())

# Hide any panel left over when the feature count does not fill the grid
for unused_ax in axes[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()