In [None]:
# NICU Data Exploratory Analysis
# Notebook for exploring NICU kangaroo care data patterns

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append('..')

from src.data.generator import generate_nicu_data

# Set style
# Apply a matplotlib style sheet and a seaborn colour palette so that every
# figure drawn later in the notebook shares the same look.
# NOTE(review): the 'seaborn-v0_8' style name exists only in Matplotlib 3.6+;
# confirm the pinned Matplotlib version if this line raises OSError.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Generate or load data
# Reuse the cached train/val/test CSV splits when they are all on disk;
# otherwise build fresh splits with generate_nicu_data (its result is
# unpacked into three frames below).
data_path = Path('../data/processed')
split_files = [data_path / f'{split}.csv' for split in ('train', 'val', 'test')]

# Require every split file, not just train.csv: a partially written cache
# would otherwise crash in read_csv with FileNotFoundError instead of
# falling back to generation.
if all(split_file.exists() for split_file in split_files):
    train_df, val_df, test_df = (pd.read_csv(split_file) for split_file in split_files)
    print("Loaded existing data")
else:
    print("Generating new data...")
    train_df, val_df, test_df = generate_nicu_data(n_samples=2000)

print(f"Train: {len(train_df)} samples")
print(f"Val: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

In [None]:
# Basic statistics
# Summary statistics (count, mean, std, min, quartiles, max) of the training
# split. Left as the last expression so the notebook renders the resulting
# table as the cell output.
train_df.describe()

In [None]:
# Class distribution
# Left panel: share of each target class in the training split (pie).
# Right panel: per-class row counts for the train/val/test splits (bars).
target_col = 'suitable_for_kangaroo_care'
# Assumes the target is boolean (or 0/1) -- the rest of the notebook compares
# it against True/False; verify against the data generator.
class_names = {False: 'Not Suitable', True: 'Suitable'}

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Pie chart
# value_counts() orders rows by frequency, not by class value, so positional
# labels would be swapped whenever "Suitable" is the majority class. Sort by
# class and derive each label from the index entry it belongs to.
train_counts = train_df[target_col].value_counts().sort_index()
train_counts.plot.pie(
    autopct='%1.1f%%',
    labels=[class_names[bool(value)] for value in train_counts.index],
    ax=ax1,
)
ax1.set_title('Class Distribution')

# Bar chart
# concat aligns the three count series on class value; a class absent from a
# split becomes NaN, which is shown as a zero-height bar.
split_counts = pd.concat(
    [
        train_df[target_col].value_counts().rename('Train'),
        val_df[target_col].value_counts().rename('Val'),
        test_df[target_col].value_counts().rename('Test'),
    ],
    axis=1,
).fillna(0).sort_index()
split_counts.plot.bar(ax=ax2)
ax2.set_title('Class Distribution by Dataset')
ax2.set_xticklabels(
    [class_names[bool(value)] for value in split_counts.index],
    rotation=0,
)

plt.tight_layout()
plt.show()

In [None]:
# Feature distributions
# One histogram per feature of the training split, laid out three per row.
feature_cols = ['heart_rate', 'oxygen_saturation', 'respiratory_rate', 'weight_grams', 'temperature_celsius']

# Size the grid from the feature list (ceil division) instead of a fixed 2x3,
# so adding features cannot index past the last panel. Five features still
# yield the original 2x3 grid at figsize (15, 10).
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, feature_cols):
    train_df[col].hist(bins=50, ax=ax, alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the leftover panels so the figure does not show empty axes frames.
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Feature distributions by class
# Overlaid density histograms of each feature for rows flagged suitable vs.
# not suitable, laid out three per row.

# Size the grid from the feature list (ceil division) instead of a fixed 2x3,
# so adding features cannot index past the last panel. Five features still
# yield the original 2x3 grid at figsize (15, 10).
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

# The class masks do not depend on the feature, so build them once.
is_suitable = train_df['suitable_for_kangaroo_care'] == True
is_not_suitable = train_df['suitable_for_kangaroo_care'] == False

for ax, col in zip(axes, feature_cols):
    suitable = train_df.loc[is_suitable, col]
    not_suitable = train_df.loc[is_not_suitable, col]

    # density=True normalises each histogram to unit area, so the two classes
    # are comparable even when their row counts differ.
    ax.hist(suitable, bins=30, alpha=0.5, label='Suitable', density=True)
    ax.hist(not_suitable, bins=30, alpha=0.5, label='Not Suitable', density=True)
    ax.set_title(f'{col} by Suitability')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()

# Hide the leftover panels so the figure does not show empty axes frames.
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
# Pairwise correlations between the feature columns of the training split,
# drawn as an annotated heatmap on a diverging colour map centred at zero.
corr_matrix = train_df[feature_cols].corr()

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:
# Pairplot for feature relationships
# Scatter matrix of the features coloured by the target, drawn on a random
# subsample to keep rendering fast.
PAIRPLOT_MAX_ROWS = 500

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the frame size.
# random_state is fixed so the subsample is the same on every run.
sample_df = train_df.sample(
    n=min(PAIRPLOT_MAX_ROWS, len(train_df)),
    random_state=42,
)
sns.pairplot(
    sample_df[feature_cols + ['suitable_for_kangaroo_care']],
    hue='suitable_for_kangaroo_care',
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
)
# y slightly above 1 places the title above the grid of panels.
plt.suptitle('Feature Relationships by Suitability', y=1.02)
plt.show()

In [None]:
# Box plots for each feature by suitability
# One panel per feature, with the training rows grouped by the target column.

# Size the grid from the feature list (ceil division) instead of a fixed 2x3,
# so adding features cannot index past the last panel. Five features still
# yield the original 2x3 grid at figsize (15, 10).
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, feature_cols):
    train_df.boxplot(column=col, by='suitable_for_kangaroo_care', ax=ax)
    # Override the title and axis labels that pandas generates for grouped
    # box plots.
    ax.set_title(f'{col} by Suitability')
    ax.set_xlabel('Suitable for Kangaroo Care')
    ax.set_ylabel(col)

# Hide the leftover panels so the figure does not show empty axes frames.
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.suptitle('')  # Remove automatic suptitle
plt.tight_layout()
plt.show()

In [None]:
# Feature importance based on mean differences
# Score each feature by the absolute gap between the two class means, divided
# by the feature's standard deviation over the whole training split so that
# features on different scales are comparable.
target = train_df['suitable_for_kangaroo_care']
suitable_rows = train_df[target.eq(True)]
not_suitable_rows = train_df[target.eq(False)]

feature_importance = {
    col: abs(suitable_rows[col].mean() - not_suitable_rows[col].mean()) / train_df[col].std()
    for col in feature_cols
}

# Plot feature importance
features = list(feature_importance)
importances = [feature_importance[name] for name in features]

plt.figure(figsize=(10, 6))
plt.bar(features, importances)
plt.title('Feature Importance (Normalized Mean Difference)')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Statistical tests for feature differences
# Independent two-sample t-test per feature, comparing rows flagged suitable
# against rows flagged not suitable, printed as one line per feature.
from scipy import stats

print("Statistical significance of features (t-test):")
print("-" * 50)

target = train_df['suitable_for_kangaroo_care']
suitable_mask = target.eq(True)
not_suitable_mask = target.eq(False)

for col in feature_cols:
    suitable = train_df.loc[suitable_mask, col]
    not_suitable = train_df.loc[not_suitable_mask, col]

    # ttest_ind is called with its defaults, as in the original analysis.
    t_stat, p_value = stats.ttest_ind(suitable, not_suitable)
    print(f"{col:20s}: t={t_stat:7.3f}, p={p_value:.3e}")

In [None]:
# Identify edge cases and outliers
# Flag training rows where at least one measurement falls outside the limits
# below, then compare the share of suitable rows inside and outside that set.
print("\nEdge cases analysis:")
print("-" * 50)

heart_rate = train_df['heart_rate']
oxygen_saturation = train_df['oxygen_saturation']
respiratory_rate = train_df['respiratory_rate']
weight_grams = train_df['weight_grams']
temperature_celsius = train_df['temperature_celsius']

# Define edge case conditions
# NOTE(review): the numeric limits are kept exactly as originally written;
# confirm them against the project's clinical reference. Comparisons with
# missing values evaluate to False, so rows with NaN measurements are not
# flagged by that measurement.
is_edge_case = (
    (heart_rate < 60) | (heart_rate > 180)
    | (oxygen_saturation < 85)
    | (respiratory_rate < 25) | (respiratory_rate > 70)
    | (weight_grams < 1000)
    | (temperature_celsius < 35.5) | (temperature_celsius > 38.0)
)

edge_cases = train_df[is_edge_case]
# Take the complement from the same boolean mask rather than by index-label
# membership: the mask is positional, so it stays correct even when the frame
# carries duplicate index labels.
normal_cases = train_df[~is_edge_case]

print(f"Total edge cases: {len(edge_cases)} ({len(edge_cases)/len(train_df)*100:.1f}%)")
print(f"Edge cases suitable: {edge_cases['suitable_for_kangaroo_care'].sum()} ({edge_cases['suitable_for_kangaroo_care'].mean()*100:.1f}%)")
print(f"Normal cases suitable: {normal_cases['suitable_for_kangaroo_care'].mean()*100:.1f}%")