This notebook covers:

1. Loading tabular data with pandas
2. Data validation and quality checks
3. Basic data exploration and statistics
4. Data visualization and relationships

The data format follows these requirements:
- Each row represents a unique sample with a distinct sample_id
- Features are numeric values used for classification
- Labels represent classification groups (can be repeated across samples)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import toml
from typing import Dict, Any

# Pandas display settings: never truncate columns, cap printed rows at 20,
# and render floats with three decimal places.
_display_options = {
    'display.max_columns': None,
    'display.max_rows': 20,
    'display.float_format': lambda value: '%.3f' % value,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

def load_config(config_path: str) -> Dict[str, Any]:
    """
    Read a TOML configuration file and return its parsed contents.

    A confirmation message is printed on success. On failure an error
    message is printed and the original exception is re-raised.

    Args:
        config_path: Location of the TOML file to read

    Returns:
        The configuration as parsed by ``toml.load``

    Raises:
        Exception: Whatever the loader raised, propagated unchanged
    """
    try:
        parsed = toml.load(config_path)
    except Exception as err:
        # Report the problem, then let the caller see the original error
        print(f"Error loading configuration: {err!s}")
        raise
    else:
        print("Configuration loaded successfully!")
        return parsed

# Read the project configuration from the relative path '../config.toml'
config = load_config(Path('../config.toml'))

# Copy plotting preferences from the optional 'visualization' section into
# matplotlib / seaborn; keys that are absent leave the library defaults alone.
if 'visualization' in config:
    _viz_section = config['visualization']
    _rc_targets = (('dpi', 'figure.dpi'), ('figure_size', 'figure.figsize'))
    for _config_key, _rc_name in _rc_targets:
        if _config_key in _viz_section:
            plt.rcParams[_rc_name] = _viz_section[_config_key]
    if 'color_palette' in _viz_section:
        sns.set_palette(_viz_section['color_palette'])


## 1. Load Data

The data structure follows these rules:
- Each row represents a unique sample with a distinct `sample_id`
- Features are numeric values for classification
- `labels` can be repeated as they represent classification groups

Let's load and examine this data:


In [None]:

# Load the data using config
data_config = config['data']
data_path = data_config['input_path']
df = pd.read_csv(data_path)

# Get column names from config
sample_id_col = data_config['sample_id_column']
label_col = data_config['label_column']

# Get feature columns. A missing or empty 'feature_columns' entry means
# "auto-detect", so read it with .get() instead of failing with a KeyError.
configured_features = data_config.get('feature_columns')
if configured_features:
    # Use explicitly specified feature columns
    feature_cols = configured_features
else:
    # Auto-detect: every numeric column except the sample ID and the label.
    # feature_cols is a plain Python list, so the two bookkeeping columns are
    # simply filtered out of it.
    non_feature_cols = {sample_id_col, label_col}
    feature_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in non_feature_cols
    ]

# Print a short summary of the loaded table: size, features, IDs and labels
dataset_summary = [
    "Dataset Info:",
    "-" * 40,
    f"Number of samples: {len(df)}",
    f"Number of features: {len(feature_cols)}",
    f"Feature names: {feature_cols}",
    f"Number of unique samples: {df[sample_id_col].nunique()}",
    f"Unique labels: {df[label_col].unique()}",
    f"Label distribution: {df[label_col].value_counts().to_dict()}",
]
print("\n".join(dataset_summary))

# Basic data validation
print("\nData Validation:")
print("-" * 40)

# Sample IDs are expected to be unique; if not, list every row that shares
# its ID with another row (keep=False marks all members of a duplicate group)
if df[sample_id_col].nunique() != len(df):
    print("\nWARNING: Duplicate sample IDs found!")
    repeated_id_mask = df[sample_id_col].duplicated(keep=False)
    duplicates = df.loc[repeated_id_mask]
    print(duplicates.loc[:, [sample_id_col, label_col]])

# Count missing entries per column and report only the affected columns
missing_values = df.isna().sum()
columns_with_gaps = missing_values > 0
if columns_with_gaps.any():
    print("\nWARNING: Missing values found:")
    print(missing_values[columns_with_gaps])


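### Log transformation

Optionally apply a log(x + 1) transform to the feature columns. The method is chosen by `preprocessing.log_transform` in the config (`log2`, `log10`, or `ln`); any other value leaves the data unchanged.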

In [None]:
# Optional log(x + 1) transform of the feature columns.
# The method name is read from config['preprocessing']['log_transform'].
# A missing section or key, or any value other than "log2" / "log10" / "ln",
# leaves the data untouched.
# NOTE: df is modified in place, so re-running this cell transforms the
# already-transformed values again.
log_transforms = {
    "log2": (np.log2, "Log2 transformation applied"),
    "log10": (np.log10, "Log10 transformation applied"),
    "ln": (np.log, "Natural logarithm transformation applied"),
}
log_method = config.get('preprocessing', {}).get('log_transform')
# Only string values can name a transform; anything else means "none"
selected_transform = (
    log_transforms.get(log_method) if isinstance(log_method, str) else None
)
if selected_transform is not None:
    log_func, log_message = selected_transform
    # The +1 offset keeps zero-valued entries finite (log(0) is -inf)
    df[feature_cols] = log_func(df[feature_cols] + 1)
    print(log_message)
else:
    print("No log transformation applied")

## 2. Data Validation

Let's perform some basic validation checks on the input data:
1. Check for missing values
2. Verify data types (numeric features, categorical labels)
3. Validate sample IDs (each sample should have a unique ID)
4. Check value ranges and distributions
5. Validate label categories


In [None]:
# Checks 1 and 2: per-column missing-value counts, then per-column dtypes.
# Both reports share the same heading / rule / body layout.
section_rule = "-" * 40
for heading, report in (
    ("Missing Values Check:", df.isna().sum()),
    ("\nData Types Check:", df.dtypes),
):
    print(heading)
    print(section_rule)
    print(report)

# 3. Validate sample IDs: every row should carry a distinct ID.
# The column is looked up through sample_id_col (taken from the config), so
# the check also works when the ID column is not literally named 'sample_id'.
print("\nSample ID Validation:")
print("-" * 40)
sample_ids = df[sample_id_col]
print(f"Number of sample_ids: {len(sample_ids)}")
print(f"Number of unique sample_ids: {sample_ids.nunique()}")
if len(sample_ids) != sample_ids.nunique():
    print("WARNING: Duplicate sample_ids found!")
    print("\nDuplicate sample_ids:")
    # Count once and reuse the result for both the values and the filter
    id_counts = sample_ids.value_counts()
    print(id_counts[id_counts > 1])

# 4. Check value ranges for numeric features
print("\nValue Ranges for Features:")
print("-" * 40)
# Preview at most five feature columns to keep the output short. The columns
# are drawn from feature_cols (not from the first five columns of df), so the
# sample ID and label columns never use up the five-column budget.
preview_cols = [
    col for col in feature_cols if col not in (sample_id_col, label_col)
][:5]
for col in preview_cols:
    print(f"{col}:")
    print(f"  Min: {df[col].min():.3f}")
    print(f"  Max: {df[col].max():.3f}")
    print(f"  Mean: {df[col].mean():.3f}")
    print(f"  Std: {df[col].std():.3f}")

# 5. Validate labels: show how many samples each label has, then list the
# sample IDs that belong to each label. Columns are addressed through
# label_col / sample_id_col (from the config) rather than fixed names.
print("\nLabel Distribution:")
print("-" * 40)
print("Label counts:")
print(df[label_col].value_counts())
print("\nSamples per label:")
for label in df[label_col].unique():
    print(f"\nLabel {label}:")
    print(df.loc[df[label_col] == label, sample_id_col].tolist())


## 3. Data Visualization

### Label distribution

In [None]:
# Fallback plotting parameters, used when the config does not override them
default_fig_size, default_color_palette = (15, 10), 'viridis'

# The 'visualization' section is optional; a missing section behaves like
# an empty one, so both lookups fall back to the defaults above
viz_config = config.get('visualization', {})
color_palette = viz_config.get('color_palette', default_color_palette)
fig_size = viz_config.get('figure_size', default_fig_size)

# Apply the chosen palette to subsequent seaborn plots
sns.set_palette(color_palette)

# Rank features by variance and keep the highest-ranked ones
def select_top_features(data, feature_cols, n=100):
    """Return the names of the N highest-variance features, largest first."""
    per_feature_variance = data[feature_cols].var()
    ranked = per_feature_variance.sort_values(ascending=False)
    return ranked.iloc[:n].index.tolist()

# Pairwise correlations between the chosen feature columns
def compute_feature_correlations(data, features):
    """Return the correlation matrix of the given feature columns."""
    selected_columns = data[features]
    correlations = selected_columns.corr()
    return correlations

# 1. Label Distribution Analysis: one bar per label, annotated with its
# share of all samples
plt.figure(figsize=fig_size)
label_ax = sns.countplot(data=df, x=label_col)
label_ax.set_title('Label Distribution', pad=20, fontsize=14)
plt.xticks(rotation=45)
label_ax.set_xlabel('Label')
label_ax.set_ylabel('Count')

# Write each bar's percentage of the total just above the bar
total = len(df)
for bar in label_ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    label_ax.annotate(f'{100 * bar_top / total:.1f}%', (bar_centre, bar_top),
                      ha='center', va='bottom')

plt.tight_layout()
plt.show()



In [None]:
# 2. Feature Distribution Heatmap
print("\nGenerating Feature Distribution Heatmap...")

# With more than 100 features, keep only the 100 with the highest variance
n_features = len(feature_cols)
if n_features <= 100:
    selected_features = feature_cols
else:
    print(f"Selecting top 100 features by variance out of {n_features} total features")
    selected_features = select_top_features(df, feature_cols, n=100)


def _standardize(column):
    """Centre a column on its mean and divide by its standard deviation."""
    return (column - column.mean()) / column.std()


# Standardise each selected feature so all rows of the heatmap share a scale
feature_data = df[selected_features].apply(_standardize)

# Features run down the y-axis, samples along the x-axis (hence the transpose)
plt.figure(figsize=(15, 8))
plt.title('Feature Distribution Heatmap (Normalized Values)', pad=20, fontsize=14)
sns.heatmap(
    feature_data.T,
    cmap='coolwarm',
    center=0,
    yticklabels=True,
    xticklabels=False,
)
plt.xlabel('Samples')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

### Feature correlations

In [None]:
# 3. Feature Correlation Analysis
print("\nGenerating Feature Correlation Analysis...")
correlation_matrix = compute_feature_correlations(df, selected_features)

# Mask the cells strictly above the diagonal so that only the diagonal and
# the lower triangle are drawn
mask = np.triu(np.ones_like(correlation_matrix), 1)

heatmap_options = {
    'mask': mask,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {"shrink": .5},
}
plt.figure(figsize=(12, 10))
plt.title('Feature Correlation Heatmap', pad=20, fontsize=14)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.tight_layout()
plt.show()

### PCA

In [None]:
# 4. PCA Analysis
print("\nPerforming PCA Analysis...")
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: replace missing feature values with the per-feature mean
# (other strategies such as "median" are available)
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(df[feature_cols])

# Step 2: standardise the imputed features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Step 3: project the standardised data onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter the samples in PC1/PC2 space, coloured by label
plt.figure(figsize=(10, 8))
plt.title('PCA of Features by Label', pad=20, fontsize=14)

# factorize() turns each label into an integer code that drives the colormap
label_codes = pd.factorize(df[label_col])[0]
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=label_codes,
    cmap='viridis',
    alpha=0.6,
)


def _legend_marker(code, label):
    """Build a legend handle whose fill matches the scatter colour for `code`."""
    return plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=scatter.cmap(scatter.norm(code)),
                      label=label, markersize=10)


# One legend entry per label, numbered by position in unique()
unique_labels = df[label_col].unique()
legend_elements = [
    _legend_marker(code, label) for code, label in enumerate(unique_labels)
]
plt.legend(handles=legend_elements, title='Labels')

# Axis labels report the share of variance each component explains
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
plt.xlabel(f'PC1 ({pc1_ratio:.1%} variance)')
plt.ylabel(f'PC2 ({pc2_ratio:.1%} variance)')
plt.tight_layout()
plt.show()

# Summarise how strongly each feature loads on the two principal components
print("\nTop Features by Contribution to Principal Components:")

# Rows are features, columns are the component loadings
component_loadings = pca.components_.T
feature_importance = pd.DataFrame(
    component_loadings, index=feature_cols, columns=['PC1', 'PC2']
)

# Overall score = sum of absolute loadings across both components
feature_importance['total_importance'] = feature_importance.abs().sum(axis=1)

print("\nTop 10 Most Important Features:")
ranked_importance = feature_importance.sort_values('total_importance', ascending=False)
print(ranked_importance.head(10))

In [None]:
# Persist the dataframe, feature list and summary metadata for next steps
import pickle
from pathlib import Path

# Make sure the output directory exists (parents are created as needed)
save_dir = Path('../data/processed')
save_dir.mkdir(parents=True, exist_ok=True)

# Payload part 1: the table itself and the names of its feature columns.
# df is stored as it currently stands, i.e. including any log transformation
# applied earlier in this notebook.
saved_data = {
    'df': df,
    'feature_cols': feature_cols,
}

# Payload part 2: column roles and simple size / label statistics
saved_metadata = {
    'sample_id_col': sample_id_col,
    'label_col': label_col,
    'n_features': len(feature_cols),
    'n_samples': len(df),
    'label_distribution': df[label_col].value_counts().to_dict(),
}

objects_to_save = {
    'data': saved_data,
    'metadata': saved_metadata,
}

# Write everything to a single pickle file
output_file = save_dir / '01_data_ingestion.pkl'
with output_file.open('wb') as handle:
    pickle.dump(objects_to_save, handle)
print(f"Saved objects to {output_file}")