In [1]:
# ============================================================================
# KAGGLE GPU DRIVER NOTEBOOK - Burnout Prediction Training Pipeline
# ============================================================================
# This notebook runs the full training pipeline on Kaggle's GPU environment
# Dataset: Work-Life Balance Synthetic Daily Wellness Dataset
# ============================================================================

## Step 1: Environment Setup
Clone the repository and install dependencies.

In [None]:
# Clone the GitHub repository (if not already cloned)
import os
if not os.path.exists('/kaggle/working/fds-project'):
    !git clone https://github.com/danpinocontrollino/fds-project.git /kaggle/working/fds-project

# Change working directory to the cloned repo (MUST run before any scripts)
%cd /kaggle/working/fds-project

# Verify we're in the right place
print(f"Working directory: {os.getcwd()}")

# Install required libraries (not pre-installed on Kaggle)
!pip install -q joblib pyarrow

Cloning into 'fds-project'...
remote: Enumerating objects: 214, done.[K
remote: Counting objects: 100% (214/214), done.[K
remote: Enumerating objects: 214, done.[K[K
remote: Counting objects: 100% (214/214), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 214 (delta 82), reused 161 (delta 50), pack-reused 0 (from 0)[K
Receiving objects: 100% (214/214), 8.82 MiB | 22.68 MiB/s, done.
remote: Total 214 (delta 82), reused 161 (delta 50), pack-reused 0 (from 0)[K
Receiving objects: 100% (214/214), 8.82 MiB | 22.68 MiB/s, done.
Resolving deltas: 100% (82/82), done.
Resolving deltas: 100% (82/82), done.
/workspaces/FDS-Project/notebooks/fds-project
/workspaces/FDS-Project/notebooks/fds-project


## Step 2: Data Setup
Copy the Kaggle dataset into the expected `data/raw/` directory.

**Important:** Kaggle input paths are read-only, so we must copy (not symlink) the files.

In [3]:
import os
import shutil
from pathlib import Path

# Kaggle's input path for the dataset
KAGGLE_INPUT = Path("/kaggle/input/worklife-balance-synthetic-daily-wellness-dataset")

# Target directory in the cloned repo
RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Copy all CSV files from Kaggle input to data/raw/
print("Copying dataset files from Kaggle input...")
for csv_file in KAGGLE_INPUT.glob("*.csv"):
    dest = RAW_DIR / csv_file.name
    shutil.copy(csv_file, dest)
    print(f"  ✓ {csv_file.name} -> {dest}")

# Verify the files were copied
print(f"\nFiles in {RAW_DIR}:")
for f in RAW_DIR.iterdir():
    print(f"  - {f.name} ({f.stat().st_size / 1024:.1f} KB)")

Copying dataset files from Kaggle input...

Files in data/raw:


## Step 3: Pipeline Execution
Run the preprocessing scripts to generate burnout labels and prepare features.

In [4]:
# Step 3a: Generate burnout labels from raw data
print("=" * 60)
print("Running: create_burnout_labels.py")
print("=" * 60)
!python scripts/create_burnout_labels.py

Running: create_burnout_labels.py


Traceback (most recent call last):
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/create_burnout_labels.py", line 209, in <module>
    main()
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/create_burnout_labels.py", line 187, in main
    weekly, daily = load_raw_frames()
                    ^^^^^^^^^^^^^^^^^
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/create_burnout_labels.py", line 68, in load_raw_frames
    weekly = pd.read_csv(RAW_DIR / "weekly_summaries.csv", parse_dates=["week_start"])
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/FDS-Project/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/FDS-Project/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffe

In [None]:
# Step 3b: Preprocess features for MLP model
print("=" * 60)
print("Running: preprocess.py")
print("=" * 60)
!python scripts/preprocess.py

# Create models directory for saving (ensure it exists before training)
from pathlib import Path
Path("models/saved").mkdir(parents=True, exist_ok=True)
print(f"\n✓ Created models/saved directory")

Running: preprocess.py


Traceback (most recent call last):
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/preprocess.py", line 364, in <module>
    main()
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/preprocess.py", line 348, in main
    ensure_inputs_exist()
  File "/workspaces/FDS-Project/notebooks/fds-project/scripts/preprocess.py", line 123, in ensure_inputs_exist
    raise FileNotFoundError(
FileNotFoundError: Missing processed parquet files. Run scripts/create_burnout_labels.py first: data/processed/daily_with_burnout.parquet, data/processed/weekly_with_burnout.parquet


## Step 4: Model Training (Forecasting Mode)
Train all models using GPU acceleration.

**IMPORTANT: All sequence models now predict NEXT WEEK's burnout (7 days ahead).**
This prevents data leakage and enables realistic early warning.

**Models:**
- MLP (Multi-Layer Perceptron) - tabular baseline (same-week, for comparison)
- LSTM (Long Short-Term Memory) - forecasting 7 days ahead
- GRU (Gated Recurrent Unit) - forecasting 7 days ahead
- Transformer - forecasting 7 days ahead

**Configuration:**
- Window: 7 days (weekly patterns)
- Forecast Horizon: 7 days (predict next week's burnout)
- Epochs: 40
- Sample Users: 100% (use full dataset with GPU)

In [6]:
# Check GPU availability
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.9.1+cpu
CUDA available: False


In [None]:
# Train MLP model (tabular baseline)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: MLP Classifier (Tabular)")
print("=" * 60)
!python scripts/train_mlp.py --epochs 40

In [None]:
# Train LSTM model for Burnout (forecasting 7 days ahead)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: LSTM Burnout Predictor (7 Days Ahead)")
print("=" * 60)
!python scripts/train_lstm.py --model lstm --target burnout --window 7 --epochs 40 --sample-users 1.0 --forecast-horizon 7

Training: Transformer Sequence Model


usage: train_lstm.py [-h] [--model {lstm,gru,cnn}] [--window WINDOW]
                     [--epochs EPOCHS] [--batch-size BATCH_SIZE] [--lr LR]
                     [--sample-users SAMPLE_USERS]
train_lstm.py: error: argument --model: invalid choice: 'transformer' (choose from lstm, gru, cnn)


In [None]:
# Train GRU model for Burnout (forecasting 7 days ahead)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: GRU Burnout Predictor (7 Days Ahead)")
print("=" * 60)
!python scripts/train_lstm.py --model gru --target burnout --window 7 --epochs 40 --sample-users 1.0 --forecast-horizon 7

In [None]:
# Train Transformer model for Burnout (forecasting 7 days ahead)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: Transformer Burnout Predictor (7 Days Ahead)")
print("=" * 60)
!python scripts/train_transformer.py --target burnout --window 7 --epochs 40 --sample-users 1.0 --forecast-horizon 7

## Step 5: MAE Pre-training & Fine-tuning
Train a Masked Autoencoder (self-supervised) then fine-tune for classification.

**MAE learns behavioral patterns by reconstructing masked days.**

In [None]:
# Step 5a: MAE Pre-training (self-supervised)
%cd /kaggle/working/fds-project
print("=" * 60)
print("MAE Pre-training: Learning behavioral patterns")
print("=" * 60)
!python scripts/train_mae.py --epochs 50 --sample-users 1.0

In [None]:
# Step 5b: Fine-tune MAE for classification (forecasting mode)
%cd /kaggle/working/fds-project
print("=" * 60)
print("MAE Fine-tuning: Transfer learning for burnout classification")
print("=" * 60)
!python scripts/train_mae_classifier.py --epochs 30 --sample-users 1.0 --forecast-horizon 7

## Step 6: CVAE Smart Advisor
Train a Conditional VAE that can suggest lifestyle changes to reduce burnout.

**Generates "counterfactual" schedules: "What would your week look like with low burnout?"**

In [None]:
# Train CVAE Smart Advisor
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: CVAE Smart Advisor (Generative Model)")
print("=" * 60)
!python scripts/train_cvae.py --epochs 100 --sample-users 1.0

## Step 7: Focus/Deep Work Prediction
Train models to predict tomorrow's focus level based on the last 7 days.

**Focus prediction has stronger correlates than burnout:**
- meetings_count: -0.31 correlation (fewer meetings = better focus)
- stress_level: -0.20 correlation
- sleep_hours: +0.16 correlation

**Target: focus_level (Low/Medium/High based on focus_score 1-10)**

In [None]:
# Train LSTM for Focus prediction (next day)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: LSTM Focus Predictor (Next Day)")
print("=" * 60)
!python scripts/train_lstm.py --model lstm --target focus --window 7 --epochs 40 --sample-users 1.0 --forecast-horizon 1

In [None]:
# Train Transformer for Focus prediction (next day)
%cd /kaggle/working/fds-project
print("=" * 60)
print("Training: Transformer Focus Predictor (Next Day)")
print("=" * 60)
!python scripts/train_transformer.py --target focus --window 7 --epochs 40 --sample-users 1.0 --forecast-horizon 1

## Step 7: Save Models to Kaggle Output
Copy all trained models to `/kaggle/working/` so they persist after the session.

In [None]:
# Copy trained models to Kaggle's output directory
import shutil
from pathlib import Path
import os

# Use absolute path - the repo is cloned to /kaggle/working/fds-project
REPO_DIR = Path("/kaggle/working/fds-project")
MODEL_DIR = REPO_DIR / "models" / "saved"

print(f"Looking for models in: {MODEL_DIR}")
print(f"Current working directory: {os.getcwd()}")

# Create the output directory
OUTPUT_DIR = Path("/kaggle/working")

# Check if models exist
if MODEL_DIR.exists():
    model_files = list(MODEL_DIR.glob("*.pt"))
    if model_files:
        print(f"\nFound {len(model_files)} model files:")
        for model_file in model_files:
            dest = OUTPUT_DIR / model_file.name
            shutil.copy(model_file, dest)
            print(f"  ✓ Copied: {model_file.name} -> {dest}")
        print(f"\n✅ Models saved! Download from Kaggle's 'Output' tab.")
    else:
        print("❌ No .pt files found in models/saved/")
        print("   The training scripts may not have saved properly.")
else:
    print(f"❌ Directory does not exist: {MODEL_DIR}")
    print("\nSearching for .pt files anywhere in working directory...")
    !find /kaggle/working -name "*.pt" -type f 2>/dev/null
    
    print("\nDirectory structure:")
    !ls -la /kaggle/working/fds-project/models/ 2>/dev/null || echo "models/ dir not found"

PermissionError: [Errno 13] Permission denied: '/kaggle'