## Setup Nvidia Toolkit and Install nvtop

``` bash
# Install NVIDIA's signing key, then register the libnvidia-container apt
# repository with a signed-by entry that points at that key.
# Both downloads use -f so that an HTTP error page is not piped into the
# keyring or the apt sources list.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
```
``` bash
# Install the NVIDIA container toolkit and register it as a Docker runtime
sudo apt update
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker

# Add the cgroupfs exec-opt only when daemon.json has no "exec-opts" key yet.
# The jq result is captured first and checked, so a jq failure (missing or
# invalid file) can never replace daemon.json with an empty file.
if daemon_json=$(sudo jq 'if has("exec-opts") then . else . + {"exec-opts": ["native.cgroupdriver=cgroupfs"]} end' \
  /etc/docker/daemon.json) && [ -n "$daemon_json" ]; then
  printf '%s\n' "$daemon_json" | sudo tee /etc/docker/daemon.json.tmp > /dev/null && \
    sudo mv /etc/docker/daemon.json.tmp /etc/docker/daemon.json
else
  echo "Could not update /etc/docker/daemon.json; leaving it unchanged" >&2
fi

sudo systemctl restart docker
# Smoke test: nvidia-smi inside a container with all GPUs attached
docker run --rm --gpus all ubuntu nvidia-smi
```

``` bash
# Refresh the package index, then install nvtop without prompting
sudo apt update
sudo apt install --yes nvtop
```

## Export all keys and endpoints in the terminal at the start, for use in later steps

``` bash
# Run in node terminal
# Replace each <...> placeholder with the real value and keep the single
# quotes: they stop the shell from interpreting special characters in a value.
export AWS_ACCESS_KEY_ID='<your-key>'
export AWS_SECRET_ACCESS_KEY='<your-secret>'
export MINIO_ENDPOINT='<minio-endpoint>'
export MLFLOW_TRACKING_URI='<mlflow-endpoint>'
export RAY_ADDRESS='<ray-endpoint>'
```

``` bash
# OPTIONAL - build the Jupyter image if you plan to test and train from a notebook
docker build --tag jupyter-train --file MLOps/model_train/docker/Dockerfile.jupyter_cuda .

# OPTIONAL - bring up minio, mlflow and postgres locally
docker compose --file MLOps/model_train/docker/docker-compose-infra.yaml up --detach

# COMPULSORY - bring up Ray and the FAST-API server
docker compose --file MLOps/model_train/docker/docker-compose-model-train-setup.yaml up --detach
```

### If the jupyter-train image was built and you need to run it, execute the commands below [OPTIONAL]

``` bash
# Start the jupyter-train image as a detached container named "jupyter",
# publishing port 8888 and passing the exported endpoints and credentials
# into the container.
# Every expansion is double-quoted so a value containing spaces or glob
# characters stays a single argument to docker.
docker run  -d --rm  -p 8888:8888 \
    --gpus all \
    --shm-size 16G \
    -v ~/MLOps:/home/jovyan/work/ \
    -v features_data:/home/jovyan/data/ \
    -e MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI}" \
    -e AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" \
    -e AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}" \
    -e RAY_ADDRESS="${RAY_ADDRESS}" \
    -e AWS_ENDPOINT_URL="${MINIO_ENDPOINT}" \
    -e BIRDCLEF_BASE_DIR=/home/jovyan/data/ \
    --mount type=bind,source=/mnt/object,target=/mnt/birdclef,readonly \
    --name jupyter \
    jupyter-train
```

``` bash
# Print the logs of the container named "jupyter"
# NOTE(review): presumably used to read the notebook URL/token - confirm
docker logs jupyter
```

### Sample Ray Train scheduling commands via jupyter-train

``` bash
# Run on Jupyter Notebook launched in docker

# submit_train_job GPUS CPUS SCRIPT [SCRIPT_ARGS...]
# Submits SCRIPT as a Ray job from the current directory without waiting for
# it to finish; GPUS and CPUS are the resources requested for the entrypoint.
submit_train_job() {
  local gpus=$1 cpus=$2
  shift 2
  ray job submit --runtime-env runtime.json \
    --entrypoint-num-gpus "$gpus" --entrypoint-num-cpus "$cpus" \
    --verbose --no-wait --working-dir . -- python "$@"
}

# Single-epoch training runs
submit_train_job 0.25 8 trainPannsEmb.py --epochs 1
submit_train_job 0.75 16 trainResNet50.py --epochs 1
submit_train_job 1 16 trainEffNetB3.py --epochs 1
submit_train_job 1 16 trainRawCNN.py --epochs 1

# Hyper-parameter tuning runs
submit_train_job 0.25 8 hyperParameterTunePannsEmb.py --epochs 3 --num_samples 5
submit_train_job 0.25 8 hyperParameterTuneEffNetB3.py --epochs 3 --num_samples 5
```

``` bash
# Start the jupyter-mlflow image as a detached container.
# It uses the container name "jupyter" and host port 8888, so any other
# container already using that name or port must be stopped first.
docker run --detach --rm \
    --name jupyter \
    --publish 8888:8888 \
    --gpus all \
    --shm-size 16G \
    --volume ~/MLOps:/home/jovyan/work/ \
    --env MLFLOW_TRACKING_URI=http://129.114.26.77:8000/ \
    --env FOOD11_DATA_DIR=/mnt/Food-11 \
    --mount type=bind,source=/mnt/object,target=/mnt/birdclef,readonly \
    jupyter-mlflow
```

# [ADDITIONAL] - Keep Track of Experiment training status from bash

## Poll MLflow

``` bash
# Poll the MLflow tracking server every 10 seconds until the run reaches a
# terminal state. Requires MLFLOW_TRACKING_URI to be set in the environment.
RUN_ID="2767d8907bdf4b48a22909b525677f76"

# terminal states to stop polling on
TERMINAL="FINISHED|FAILED|KILLED"

# Stop after this many consecutive polls that return no usable status
# (server unreachable, unknown run ID, non-JSON response) instead of
# looping forever.
MAX_FAILURES=5
failures=0

while true; do
  # ${MLFLOW_TRACKING_URI%/} drops one trailing slash so the URL never
  # contains "//api"; "// empty" turns a missing status into an empty string.
  status=$(curl -sS \
    "${MLFLOW_TRACKING_URI%/}/api/2.0/mlflow/runs/get?run_id=${RUN_ID}" \
    | jq -r '.run.info.status // empty')

  if [[ -z "$status" ]]; then
    failures=$((failures + 1))
    echo "$(date +%T) → no status for run ${RUN_ID} (attempt ${failures}/${MAX_FAILURES})" >&2
    if (( failures >= MAX_FAILURES )); then
      echo "Giving up: check MLFLOW_TRACKING_URI and RUN_ID" >&2
      break
    fi
    sleep 10
    continue
  fi
  failures=0

  echo "$(date +%T) → run ${RUN_ID} status: ${status}"

  if [[ "$status" =~ ^(${TERMINAL})$ ]]; then
    echo "Run reached terminal state: $status"
    break
  fi

  sleep 10
done
```

## Poll the FastAPI server with a Ray job ID

``` bash
# Save as a script and run it with the Ray job ID as the first argument.
# Polls the FastAPI status endpoint every 10 seconds until the job finishes.
# Exit status: 0 = SUCCEEDED, 1 = FAILED/STOPPED or no status obtainable,
#              2 = job ID argument missing.
JOB_ID="${1:-}"
if [[ -z "$JOB_ID" ]]; then
  echo "Usage: $0 <ray-job-id>" >&2
  exit 2
fi
STATUS_URL="http://192.5.87.49:9090/status?job_id=${JOB_ID}"

# Stop after this many consecutive polls without a usable status instead of
# looping forever on an unreachable server or an unexpected response.
MAX_FAILURES=5
failures=0

echo "Tracking Ray job ${JOB_ID} at ${STATUS_URL}"

while true; do
  # Fetch status JSON; a failed request is treated as an empty response
  RESP=$(curl -sS "$STATUS_URL") || RESP=""
  # jq errors on a non-JSON body are silenced on purpose: the empty STATUS
  # that results is reported by the failure branch below.
  STATUS=$(jq -r '.status // empty' <<<"$RESP" 2>/dev/null)

  if [[ -z "$STATUS" ]]; then
    failures=$((failures + 1))
    printf "%s → no status for job %s (attempt %s/%s)\n" \
      "$(date +'%T')" "$JOB_ID" "$failures" "$MAX_FAILURES" >&2
    if (( failures >= MAX_FAILURES )); then
      echo "Giving up on job ${JOB_ID}: no status from ${STATUS_URL}" >&2
      exit 1
    fi
    sleep 10
    continue
  fi
  failures=0

  printf "%s → job %s status: %s\n" "$(date +'%T')" "$JOB_ID" "$STATUS"

  if [[ "$STATUS" == "SUCCEEDED" ]]; then
    # Extract MLflow run ID if present
    MLFLOW_ID=$(jq -r '.mlflow_run_id // empty' <<<"$RESP")
    if [[ -n "$MLFLOW_ID" ]]; then
      echo "Job succeeded. MLflow run ID: $MLFLOW_ID"
    else
      echo "Job succeeded. (No MLflow run ID found in response.)"
    fi
    exit 0

  elif [[ "$STATUS" =~ ^(FAILED|STOPPED)$ ]]; then
    echo "Job ${JOB_ID} ended with status: $STATUS"
    exit 1
  fi

  sleep 10
done
```