# Day 2: Sweeping with wandb and keras

The purpose of this notebook is to:
* Run a hyperparameter sweep on the Kaggle MNIST dataset

## Imports

In [1]:
import kaggle
import keras
import torch
import wandb
from wandb.keras import WandbCallback
# Pick the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` does not appear to be passed to the Keras model built later
# in this notebook — confirm whether this check is still needed.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

2024-05-06 15:53:07.632334: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda:0


### Check GPU

In [2]:
# Load the TensorBoard notebook extension so the `%tensorboard` magic is available
%load_ext tensorboard
# Show the GPUs visible to the NVIDIA driver
!nvidia-smi
# Show the version of the installed CUDA compiler
!nvcc --version
# Report how many GPUs TensorFlow can see (this only reports; it does not select a device)
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Report the device PyTorch would use (same check as in the imports cell)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


Mon May  6 15:53:15 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    Off |   00000000:01:00.0 Off |                  N/A |
| N/A   39C    P5             24W /   40W |      11MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

2024-05-06 15:53:15.710906: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-06 15:53:15.713024: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-06 15:53:15.713336: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

## Download the dataset from Kaggle

In [3]:
import os
import subprocess
import zipfile

# Directory (relative to the notebook's working directory) that receives the raw Kaggle files
data_dir = "MyDataset/mnist/raw"
competition_name = "digit-recognizer"

# Create the directory if needed; a no-op when it already exists
os.makedirs(data_dir, exist_ok=True)

# Download the competition archive into `data_dir`. The child process gets `data_dir`
# as its working directory, so the notebook's own working directory is never changed
# and a failure here cannot leave the kernel stranded in a sub-directory.
try:
    download_status = subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition_name],
        cwd=data_dir,
    ).returncode
except FileNotFoundError:
    # The `kaggle` command line tool is not on PATH
    download_status = None

if download_status != 0:
    # Best effort: an archive left by an earlier run may still be usable
    print(f"kaggle download did not succeed (status: {download_status}); looking for an existing archive")

# Unzip the data next to the archive
archive_path = os.path.join(data_dir, competition_name + ".zip")
if not os.path.exists(archive_path):
    raise FileNotFoundError(
        f"{archive_path} not found: the download failed and no earlier copy exists"
    )
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    zip_ref.extractall(data_dir)

digit-recognizer.zip: Skipping, found more recently modified local copy (use --force to force download)


## Load and prepare the data

In [4]:
import pandas as pd
import numpy as np

# Read the extracted Kaggle CSV files into DataFrames
train_df = pd.read_csv('MyDataset/mnist/raw/train.csv')
test_df = pd.read_csv('MyDataset/mnist/raw/test.csv')

# The "label" column is the target; every other column of the training frame is a
# feature. Both are extracted directly as numpy arrays.
y_train = train_df["label"].to_numpy()
x_train = train_df.drop(columns=["label"]).to_numpy()

# All columns of the test frame are used as features
x_test = test_df.to_numpy()



### Inspect the data

In [5]:
train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Prepare the data

In [6]:
# Make Validation set
# NOTE: this cell rebinds x_train / y_train / x_test in place, so re-run the loading
# cell first if this cell has to be executed again (otherwise the data is split and
# scaled a second time).
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.1, random_state=2)

# Scale images to the [0, 1] range. The validation split must be scaled exactly like
# the training split; otherwise validation metrics (and early stopping / checkpointing
# driven by them) are computed on [0, 255] inputs the model was never trained on.
x_train = x_train.astype("float32") / 255
x_val = x_val.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = x_train.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)
x_val = x_val.reshape(-1, 28, 28, 1)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

x_train shape: (37800, 28, 28, 1)
y_train shape: (37800,)
37800 train samples
28000 test samples


## Define Configurations

### Config function

In [7]:
import hydra
from omegaconf import OmegaConf
def is_sweep():
    """
    Tell whether the active WandB run belongs to a sweep.

    Returns:
    - bool: True only when a run is active and it carries a sweep ID; False otherwise.
    """
    active_run = wandb.run
    return active_run is not None and active_run.sweep_id is not None
def load_and_override_config(config_dir, config_name, manual_overrides=None):
    """
    Load a YAML configuration with OmegaConf, apply manual overrides, and layer the WandB config on top.

    Precedence, lowest to highest: the YAML file, `manual_overrides`, then the
    config of the active WandB run (only when a run is active).

    Args:
    - config_dir (str): Directory path where configuration files are stored.
    - config_name (str): Name of the configuration file to load without the extension.
    - manual_overrides (dict | None): Parameters to override manually. None (the default) means no overrides.

    Returns:
    - OmegaConf.DictConfig: The final configuration object after all overrides, with an added `is_sweep` flag.
    """
    # A fresh dict per call instead of a shared mutable default argument
    if manual_overrides is None:
        manual_overrides = {}

    # Read "<config_dir>/<config_name>.yaml" directly with OmegaConf
    cfg = OmegaConf.load(f"{config_dir}/{config_name}.yaml")

    # Apply manual overrides
    cfg = OmegaConf.merge(cfg, OmegaConf.create(manual_overrides))

    # When a WandB run is active (wandb.init() was called by the caller), merge its
    # config last so that values chosen by WandB win over the file and the overrides
    if wandb.run is not None:
        wandb_config = wandb.config
        print("wandb_config",wandb_config)
        cfg = OmegaConf.merge(cfg, OmegaConf.create(dict(wandb_config)))

    # Record whether the active run is part of a sweep
    cfg.is_sweep= is_sweep()
    print("cfg",cfg)
    return cfg


import wandb

def get_or_create_sweep_id(project_name, sweep_config,force_create=False):
    """
    Get or create a sweep ID for the given project.

    The sweep ID is cached in 'sweep_ids/{project_name}_sweep_id.txt'.
    If the file exists and holds an ID, that ID is returned and `sweep_config` is ignored.
    Otherwise (file missing, file empty, or `force_create` set) a new sweep is created
    with `wandb.sweep` and its ID is written to the file, replacing any previous content.

    Args:
    project_name (str): The name of the project.
    sweep_config (dict): The configuration of the sweep.
    force_create (bool): When True, always create a new sweep and overwrite the cached ID.

    Returns:
    str: The sweep ID.
    """
    sweep_id_folder = 'sweep_ids'
    sweep_id_file = os.path.join(sweep_id_folder, f'{project_name}_sweep_id.txt')

    # Reuse the cached ID unless the caller explicitly asked for a new sweep
    if not force_create and os.path.exists(sweep_id_file):
        with open(sweep_id_file, 'r') as file:
            cached_sweep_id = file.read().strip()
        # An empty cache file is treated like a missing one
        if cached_sweep_id:
            return cached_sweep_id

    # Create a new sweep and cache its ID
    sweep_id = wandb.sweep(sweep_config, project=project_name)
    # Make sure the directory exists; needed on every creation path, including force_create
    os.makedirs(sweep_id_folder, exist_ok=True)
    with open(sweep_id_file, 'w') as file:
        file.write(sweep_id)

    return sweep_id


### Define Manual Overrides

In [8]:
# Manual overrides for values loaded from the YAML config; keys are config field names.
# Leave the dict empty to use the file's values, or add entries (e.g. 'epochs') for a quick run.
# NOTE(review): presumably meant to be passed as `manual_overrides` to
# load_and_override_config — confirm the callers actually pass it.
config_overrides = {
   # 'epochs': 2,
}

## Main Function

In [9]:
%tensorboard --logdir=logs
from wandb.keras import WandbMetricsLogger, WandbEvalCallback
from IPython.display import clear_output
import gc

# Notebook-level placeholders; `best_model` is rebound when a saved model is loaded later
best_model = None
best_accuracy = 0
# Notebook-level config, read before any WandB run exists. Later cells read
# `cfg.project_name` and `cfg.username` from it. The manual overrides are passed in
# so that editing `config_overrides` actually takes effect.
cfg = load_and_override_config(".", "config", config_overrides)
def main():
    """
    Train one CNN inside a WandB run and log its best checkpoint as a model artifact.

    Used both as the function `wandb.agent` executes for each sweep trial and as a
    stand-alone entry point. Reads the notebook-level arrays `x_train`, `y_train`,
    `x_val`, `y_val` and the `config_overrides` dict.

    The WandB run is always closed: with exit code 1 when anything raises (the
    exception is re-raised afterwards), normally otherwise.
    """
    # No run exists yet, so this pass only yields the YAML values plus the manual
    # overrides; it is needed to learn the project name for wandb.init().
    cfg = load_and_override_config(".", "config", config_overrides)
    wandb.init(project=cfg.project_name)

    try:
        # With the run active, values supplied by WandB (e.g. by a sweep) now take precedence
        cfg = load_and_override_config(".", "config", config_overrides)
        print(OmegaConf.to_yaml(cfg))

        # Record the resolved config on the run. Assigning to `wandb.config` would only
        # rebind the module attribute and store nothing on the run; `update` writes to it.
        wandb.config.update(
            OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True),
            allow_val_change=True,
        )

        num_classes = 10
        input_shape = (28, 28, 1)

        # `param_scale` widens or narrows every convolution by the same factor
        model = keras.Sequential(
            [
                keras.layers.Input(shape=input_shape),
                keras.layers.Conv2D(int(64*cfg.param_scale), kernel_size=(3, 3), activation="relu"),
                keras.layers.Conv2D(int(64*cfg.param_scale), kernel_size=(3, 3), activation="relu"),
                keras.layers.MaxPooling2D(pool_size=(2, 2)),
                keras.layers.Conv2D(int(128*cfg.param_scale), kernel_size=(3, 3), activation="relu"),
                keras.layers.Conv2D(int(128*cfg.param_scale), kernel_size=(3, 3), activation="relu"),
                keras.layers.GlobalAveragePooling2D(),
                keras.layers.Dropout(cfg.dropout_rate),
                keras.layers.Dense(num_classes, activation="softmax"),
            ]
        )
        model.summary()

        # Labels are integer class ids, hence the sparse loss and metric
        model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(),
            optimizer=keras.optimizers.Adam(learning_rate=cfg.learning_rate),
            metrics=[
                keras.metrics.SparseCategoricalAccuracy(name="acc"),
            ],
        )

        callbacks = [
            keras.callbacks.EarlyStopping(monitor="val_acc",patience=cfg.patience, verbose=1, restore_best_weights=True),
            # NOTE(review): no `monitor` is given here, so the checkpoint follows Keras'
            # default metric rather than the "val_acc" used for early stopping — confirm
            # this difference is intended.
            keras.callbacks.ModelCheckpoint(cfg.best_model_path, save_best_only=True),
            keras.callbacks.TensorBoard(log_dir="./logs"),
            WandbMetricsLogger(),
        ]

        model.fit(
            x_train,
            y_train,
            batch_size=cfg.batch_size,
            epochs=cfg.epochs,
            callbacks=callbacks,
            validation_data=(x_val, y_val),
        )

        # Log the best model as an artifact
        artifact = wandb.Artifact('best-model', type='model')
        artifact.add_file(cfg.best_model_path)
        wandb.log_artifact(artifact)

        # Drop the model reference so the collection below can reclaim it
        del model
    except BaseException:
        # Close the run as failed instead of leaving it open, then let the error surface
        wandb.finish(exit_code=1)
        raise

    wandb.finish()
    gc.collect()
    clear_output(wait=True)

Reusing TensorBoard on port 6006 (pid 113549), started 17:40:46 ago. (Use '!kill 113549' to kill it.)

cfg {'project_name': 'Day2', 'username': 'frizzerdk', 'epochs': 50, 'param_scale': 1.0, 'dropout_rate': 0.5, 'learning_rate': 0.001, 'batch_size': 128, 'patience': 20, 'is_sweep': False, 'checkpoint_path': './checkpoints', 'best_model_path': '${checkpoint_path}/best_model.keras'}


## Run

In [None]:
# True: run trials through a sweep agent. False: run main() once with the file config.
do_sweep = True

sweep_config = {
    # Grid search tries every combination below: 3 * 3 * 3 * 4 = 108 trials
    'method': 'grid',
    # The metric must be a key the runs actually log. WandbMetricsLogger logs the
    # per-epoch Keras metrics under an "epoch/" prefix, so a bare 'loss' is never
    # reported. Validation accuracy is used because early stopping and the
    # top-model selection in this notebook rank by the same quantity.
    'metric': {'name': 'epoch/val_acc', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'values': [ 0.001, 3e-4, 0.0001]},
        'batch_size': {'values': [16, 32, 64]},
        'param_scale': {'values': [0.5, 1, 2]},
        'dropout_rate': {'values': [0.0,0.1, 0.5, 0.9]},
    }
}
# A cached sweep ID is reused as-is; pass force_create=True after changing
# `sweep_config`, otherwise the previously created sweep keeps being used.
sweep_id = get_or_create_sweep_id(cfg.project_name, sweep_config)

if do_sweep:
    wandb.agent(sweep_id, project=cfg.project_name, function=main)
else:
    main()

## Model prediction

In [16]:
import wandb
import os

import wandb
import os
import wandb
import os

def cleanup_and_save_top_models(project_name, username, sweep_id, top_x, sort_metric="epoch/val_acc", artifact_name="best-model", delete_other=False, local_save_path="./best_models"):
    """
    Identifies the top X best runs from a Weights & Biases sweep, saves their models
    locally as `<run name>.keras`, additionally saves the best model overall as
    `overall_best_model.keras`, and optionally deletes the artifacts of all other runs.

    Runs are ranked by `sort_metric` from the run summary, highest first. A missing,
    non-numeric or NaN value counts as 0. Problems with a single run (for example a
    missing artifact) are printed and do not stop the processing of the other runs.

    Args:
        project_name (str): The name of the wandb project.
        username (str): Your wandb username.
        sweep_id (str): The sweep ID containing the runs.
        top_x (int): The number of best runs to retain.
        sort_metric (str): The metric name to use for sorting the best runs.
        artifact_name (str): The name of the model artifact to save or delete.
        delete_other (bool): Whether to delete artifacts that aren't in the top X.
        local_save_path (str): Path where the top models will be saved locally.

    Returns:
        None
    """
    import math
    import shutil

    def _score(run):
        # Summary values may be absent, None or non-numeric; all of those rank as 0
        value = run.summary.get(sort_metric, 0)
        if not isinstance(value, (int, float)) or math.isnan(value):
            return 0
        return value

    # Initialize the wandb API
    api = wandb.Api()

    # Construct the project path
    project_path = f"{username}/{project_name}"

    # Fetch all runs associated with the specified project and sweep
    runs = api.runs(path=project_path, filters={"sweep": sweep_id})

    # Sort runs by the specified metric, best first
    sorted_runs = sorted(runs, key=_score, reverse=True)
    print(f"Found {len(sorted_runs)} runs in the sweep.")
    if not sorted_runs:
        # Nothing to rank, save or delete
        print("No runs to process.")
        return

    # Identify the top X runs
    top_runs = sorted_runs[:top_x]
    print(f"Identified the top {len(top_runs)} runs.")

    # Get the best overall run
    best_overall_run = sorted_runs[0]
    print(f"Best overall run: {best_overall_run.name}")

    # Create a set of run IDs to keep
    top_run_ids = {run.id for run in top_runs}
    print(f"Top run IDs: {top_run_ids}")

    # Create a directory to save the top models locally
    os.makedirs(local_save_path, exist_ok=True)

    # Process each run and decide whether to save or delete its artifact
    for run in sorted_runs:
        try:
            # First logged artifact whose name contains `artifact_name`
            artifact = next((a for a in run.logged_artifacts() if artifact_name in a.name), None)

            if artifact is None:
                raise ValueError(f"No artifact named {artifact_name} found for run {run.name}")

            if run.id in top_run_ids:
                # Download and save the model locally if it's in the top X.
                # Copy rather than move: the download directory stays intact and
                # the copy also works when it lives on another filesystem.
                artifact_dir = artifact.download()
                local_model_path = os.path.join(local_save_path, f"{run.name}.keras")
                shutil.copy2(os.path.join(artifact_dir, "best_model.keras"), local_model_path)
                print(f"Saved {local_model_path} locally.")

                # Save the overall best model as `overall_best_model.keras`, keeping
                # the per-run file so that all top X models remain available
                if run.id == best_overall_run.id:
                    overall_best_path = os.path.join(local_save_path, "overall_best_model.keras")
                    shutil.copy2(local_model_path, overall_best_path)
                    print(f"Saved the best overall model as {overall_best_path}.")
            elif delete_other:
                # Delete the artifact if it's not in the top X
                artifact.delete()
                print(f"Deleted artifact from run {run.name}.")
            else:
                print(f"Skipping deletion of artifact from run {run.name}.")
        except Exception as e:
            # Deliberately best effort: report the run and continue with the next one
            print(f"Could not process artifact for run {run.name}: {e}")

    print("Completed processing the models.")

# Example usage: keep the three best runs of the sweep.
# `cfg` must provide the project name and the username.
cleanup_and_save_top_models(
    project_name=cfg.project_name,
    username=cfg.username,
    sweep_id=sweep_id,
    top_x=3,
)



Found 60 runs in the sweep.
Identified the top 3 runs.
Best overall run: unique-sweep-27
Top run IDs: {'958pli3r', '2zr4i4om', 'vpwipunx'}


[34m[1mwandb[0m:   1 of 1 files downloaded.  


Saved ./best_models/unique-sweep-27.keras locally.
Saved the best overall model as ./best_models/overall_best_model.keras.


[34m[1mwandb[0m:   1 of 1 files downloaded.  


Saved ./best_models/silver-sweep-15.keras locally.


[34m[1mwandb[0m:   1 of 1 files downloaded.  


Saved ./best_models/cerulean-sweep-42.keras locally.
Could not process artifact for run dulcet-sweep-60: No artifact named best-model found for run dulcet-sweep-60
Completed processing the models.


In [None]:

import pandas as pd
# Load the model stored as the overall best one.
# load_model raises when the file cannot be loaded instead of returning None, so no
# None-check is needed and `predictions` / `class_predictions` are always defined
# once this cell has run.
best_model = keras.models.load_model("./best_models/overall_best_model.keras")
best_model.summary()

# Evaluate on the validation split; `score` holds the loss followed by the compiled metric
score = best_model.evaluate(x_val, y_val)
print("Validation loss:", score[0], "Validation accuracy:", score[1])

# Predict the test images and take the highest-scoring class index for each one
predictions = best_model.predict(x_test)
class_predictions = np.argmax(predictions, axis=1)




In [None]:
# Build the submission file only when predictions are available
if predictions is not None:
    # One row per test image: a 1-based "ImageId" next to the predicted "Label"
    image_ids = list(range(1, len(class_predictions) + 1))
    submission = pd.DataFrame({"ImageId": image_ids, "Label": class_predictions})

    # Write the table to CSV without the DataFrame index column
    submission.to_csv('my_submission.csv', index=False)

In [None]:
#!kaggle competitions submit -c digit-recognizer -f my_submission.csv -m "First submission"


In [None]:
!kaggle competitions submissions digit-recognizer