In [1]:
import os
import re
import json
import logging
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm
from collections import defaultdict
from imblearn.over_sampling import SMOTE

# -------------------- Configuration -------------------- #
CONFIG = {
    "data_dir": r"D:\CLASS NOTES\EPICS\Model Dataset\Extracted_features",
    "model_save_path": "final_model.pth",
    "scaler_save_path": "final_scaler.joblib",
    "feature_dims": {
        "spatial": 36,    # 12 body joints * 3 coordinates (x,y,z)
        "temporal": 10,   # 5 body parts * 2 features (velocity, acceleration)
        "tracked": 3,     # 3 interaction metrics
        "group": 1        # 1 group metric
    },
    "interaction_weights": {
        "tracked": 2.5,
        "group": 3.0
    },
    "batch_size": 32,
    "num_epochs": 100,
    "early_stop_patience": 10,
    "hidden_size": 512,
    "learning_rate": 1e-4,
    "class_pattern": r"^([A-Za-z]+)\d+",
    "min_samples_per_class": 2,
    "smote_sampling_strategy": "not minority"
}

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------- Feature Processing -------------------- #
def parse_features(feature_str):
    """Recursively flatten nested JSON structures to float values"""
    try:
        data = json.loads(feature_str)
        
        def flatten(value):
            if isinstance(value, dict):
                return [item for v in value.values() for item in flatten(v)]
            if isinstance(value, list):
                return [item for sublist in value for item in flatten(sublist)]
            try:
                return [float(value)]
            except:
                return []
        
        return flatten(data)
    except Exception as e:
        logger.debug(f"Parse error in '{feature_str[:50]}...': {str(e)}")
        return []

def pad_features(features, target_length):
    """Pad features with zeros to target length"""
    return features[:target_length] + [0.0] * max(target_length - len(features), 0)

# -------------------- Dataset Class -------------------- #
class AnomalyDataset(Dataset):
    def __init__(self, features, labels, scaler=None):
        # Filter empty entries
        valid_indices = [i for i, f in enumerate(features) if len(f) > 0]
        features = [features[i] for i in valid_indices]
        labels = [labels[i] for i in valid_indices]
        
        # Normalization
        self.scaler = scaler or StandardScaler()
        if scaler is None:
            self.scaler.fit(features)
        
        self.features = self.scaler.transform(features)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (
            torch.FloatTensor(self.features[idx]).to(device),
            torch.LongTensor([self.labels[idx]]).to(device)
        )

# -------------------- Model Architecture -------------------- #
class InteractionAwareModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.main = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x):
        return self.main(x)

# -------------------- Data Loading Pipeline -------------------- #
def load_and_process_data():
    """Full data processing pipeline with error handling"""
    feature_files = [os.path.join(CONFIG['data_dir'], f) 
                   for f in os.listdir(CONFIG['data_dir']) if f.endswith('.csv')]
    
    # Class analysis and filtering
    class_counts = defaultdict(int)
    valid_files = []
    for f in feature_files:
        try:
            cls_name = re.match(CONFIG['class_pattern'], os.path.basename(f)).group(1).lower()
            class_counts[cls_name] += 1
            valid_files.append(f)
        except:
            continue
    
    valid_classes = [cls for cls, count in class_counts.items() 
                    if count >= CONFIG['min_samples_per_class']]
    
    # Feature extraction
    all_features = []
    all_labels = []
    label_mapping = {cls: idx for idx, cls in enumerate(sorted(valid_classes))}
    
    for f in tqdm(valid_files, desc="Processing files"):
        try:
            cls_name = re.match(CONFIG['class_pattern'], os.path.basename(f)).group(1).lower()
            if cls_name not in valid_classes:
                continue
            
            df = pd.read_csv(f)
            label = label_mapping[cls_name]
            
            for _, row in df.iterrows():
                # Parse and pad features
                spatial = pad_features(parse_features(row['spatial']), CONFIG['feature_dims']['spatial'])
                temporal = pad_features(parse_features(row['temporal']), CONFIG['feature_dims']['temporal'])
                tracked = pad_features(parse_features(row['tracked']), CONFIG['feature_dims']['tracked'])
                group = pad_features(parse_features(row['group']), CONFIG['feature_dims']['group'])
                
                # Apply interaction weights
                interaction_feats = [
                    tracked[0] * CONFIG['interaction_weights']['tracked'],
                    tracked[1],
                    tracked[2],
                    group[0] * CONFIG['interaction_weights']['group']
                ]
                
                # Combine features
                combined = np.concatenate([spatial, temporal, interaction_feats])
                all_features.append(combined)
                all_labels.append(label)
                
        except Exception as e:
            logger.error(f"Error processing {os.path.basename(f)}: {str(e)}")
            continue
    
    return np.array(all_features), np.array(all_labels), label_mapping

# -------------------- Training Pipeline -------------------- #
def train():
    # Load and validate data
    X, y, label_mapping = load_and_process_data()
    if len(X) == 0:
        raise ValueError("No valid features extracted. Verify data files and parsing logic.")
    
    # Balance classes
    smote = SMOTE(sampling_strategy=CONFIG['smote_sampling_strategy'], random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    
    # Split dataset
    X_train, X_val, y_train, y_val = train_test_split(
        X_res, y_res, 
        test_size=0.2, 
        stratify=y_res,
        random_state=42
    )
    
    # Create datasets
    train_dataset = AnomalyDataset(X_train, y_train)
    val_dataset = AnomalyDataset(X_val, y_val, train_dataset.scaler)
    
    # Save artifacts
    joblib.dump(train_dataset.scaler, CONFIG['scaler_save_path'])
    joblib.dump(label_mapping, 'label_mapping.joblib')
    
    # Initialize model
    model = InteractionAwareModel(
        input_size=X.shape[1],
        num_classes=len(label_mapping)
    ).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])
    
    # Class-weighted loss
    class_counts = np.bincount(y_train)
    weights = torch.FloatTensor(1.0 / (class_counts + 1e-9)).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights)
    
    # Training loop
    best_auc = 0
    no_improve = 0
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'])
    
    for epoch in range(CONFIG['num_epochs']):
        model.train()
        train_loss = 0
        
        for features, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels.squeeze())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for features, labels in val_loader:
                outputs = model(features)
                val_preds.extend(torch.softmax(outputs, 1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
        
        # Calculate metrics
        val_labels = np.array(val_labels)
        val_preds = np.array(val_preds)

        logger.info(f"Prediction shape: {val_preds.shape}, Label shape: {val_labels.shape}")

        metrics = {
            'acc': accuracy_score(val_labels, np.argmax(val_preds, axis=1)),
            'f1': f1_score(val_labels, np.argmax(val_preds, axis=1), average='weighted'),
            'auc': roc_auc_score(val_labels, val_preds, multi_class='ovr', average='weighted')  # Changed to 'ovr'
        }
        
        # Early stopping
        if metrics['auc'] > best_auc:
            best_auc = metrics['auc']
            no_improve = 0
            torch.save(model.state_dict(), CONFIG['model_save_path'])
        else:
            no_improve += 1
            if no_improve >= CONFIG['early_stop_patience']:
                logger.info("Early stopping triggered")
                break
        
        logger.info(
            f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_loader):.4f} | "
            f"Val Acc: {metrics['acc']:.4f} | Val F1: {metrics['f1']:.4f} | Val AUC: {metrics['auc']:.4f}"
        )
    
    logger.info(f"Training complete. Best validation AUC: {best_auc:.4f}")

if __name__ == "__main__":
    train()

Processing files: 100%|██████████| 67/67 [00:04<00:00, 15.94it/s]
Epoch 1: 100%|██████████| 780/780 [00:03<00:00, 221.77it/s]
INFO:__main__:Prediction shape: (6237, 8), Label shape: (6237, 1)
INFO:__main__:Epoch 1 | Train Loss: 1.4473 | Val Acc: 0.6973 | Val F1: 0.6911 | Val AUC: 0.9240
Epoch 2: 100%|██████████| 780/780 [00:03<00:00, 253.16it/s]
INFO:__main__:Prediction shape: (6237, 8), Label shape: (6237, 1)
INFO:__main__:Epoch 2 | Train Loss: 0.9609 | Val Acc: 0.7335 | Val F1: 0.7302 | Val AUC: 0.9590
Epoch 3: 100%|██████████| 780/780 [00:03<00:00, 238.28it/s]
INFO:__main__:Prediction shape: (6237, 8), Label shape: (6237, 1)
INFO:__main__:Epoch 3 | Train Loss: 0.8024 | Val Acc: 0.8164 | Val F1: 0.8219 | Val AUC: 0.9712
Epoch 4: 100%|██████████| 780/780 [00:03<00:00, 222.80it/s]
INFO:__main__:Prediction shape: (6237, 8), Label shape: (6237, 1)
INFO:__main__:Epoch 4 | Train Loss: 0.6928 | Val Acc: 0.8581 | Val F1: 0.8626 | Val AUC: 0.9790
Epoch 5: 100%|██████████| 780/780 [00:03<00:00

In [None]:
# In the feature processing loop:
tracked = parse_features(row['tracked'])
group = parse_features(row['group'])

# Convert to floats and validate
try:
    tracked = [float(x) for x in tracked]
    group = [float(x) for x in group]
except ValueError as e:
    logger.error(f"Non-numeric value in {f}: {str(e)}")
    continue

interaction_feats = [
    tracked[0] * CONFIG['interaction_weights']['tracked'],
    tracked[1],  # Already converted to float
    tracked[2],
    group[0] * CONFIG['interaction_weights']['group']
]