# 🔧 Severstal Steel Defect Dataset Downloader

This notebook guides you through:
1. Setting up the Kaggle API
2. Downloading the Severstal Steel Defect Detection dataset
3. Unzipping the dataset and image files

⚠️ **Important**: You must have a [Kaggle](https://www.kaggle.com) account and an API token (`kaggle.json`) to use this notebook.

In [None]:
# ============================================
# ✅ Install Kaggle CLI if not already installed
# ============================================
# -q keeps pip's output quiet. The `kaggle` command installed here is used by
# the download cell further down.
# NOTE(review): `%pip install` targets the running kernel's environment, whereas
# `!pip` may resolve to a different interpreter - consider switching.
!pip install -q kaggle

In [None]:
# ============================================
# 🔑 Upload your Kaggle API key
# ============================================
import os
from pathlib import Path

from IPython.display import display
from ipywidgets import FileUpload

# Directory that will hold kaggle.json (written by the next cell).
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(exist_ok=True)

# Single-file picker restricted to .json files; its value is read in the next cell.
upload_widget = FileUpload(accept='.json', multiple=False)
display(upload_widget)

In [None]:
# Save the uploaded file to ~/.kaggle/kaggle.json
# FileUpload.value differs between ipywidgets releases: version 7 exposes a
# {filename: file_info} dict, version 8 a tuple of file_info mappings.
# Normalise both into a plain list so the loop below works either way.
uploaded = upload_widget.value
uploaded_files = list(uploaded.values()) if isinstance(uploaded, dict) else list(uploaded)

if not uploaded_files:
    # Previously this cell finished silently, leaving the download cell to fail later.
    print("No file uploaded yet - select your kaggle.json in the widget above, then re-run this cell.")

for file_info in uploaded_files:
    kaggle_json_path = kaggle_dir / "kaggle.json"
    # bytes() accepts both bytes and memoryview payloads.
    kaggle_json_path.write_bytes(bytes(file_info["content"]))
    # Restrict the credentials file to owner read/write.
    os.chmod(kaggle_json_path, 0o600)
    print(f"Kaggle API key saved to {kaggle_json_path}")

In [None]:
# ============================================
# ⬇️ Download the Severstal dataset ZIP file
# ============================================
# Invokes the Kaggle CLI for the competition named by -c. The unzip cell below
# expects severstal-steel-defect-detection.zip in the current working directory.
# NOTE(review): Kaggle usually requires accepting the competition rules on the
# website before this download succeeds - confirm for your account.
!kaggle competitions download -c severstal-steel-defect-detection

In [None]:
# ============================================
# 🗂️ Unzip main dataset archive
# ============================================
import zipfile

# Archive to unpack and the folder that receives its contents.
main_zip = "severstal-steel-defect-detection.zip"
extract_path = Path("severstal_dataset")
extract_path.mkdir(exist_ok=True)

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(main_zip, 'r') as archive:
    archive.extractall(path=extract_path)

print("Extracted dataset contents to:", extract_path)

In [None]:
# ============================================
# 🖼️ Unzip train/test image folders
# ============================================
# Unpack each nested image archive (if present) into a folder named after it.
for zip_name in ["train_images.zip", "test_images.zip"]:
    nested_zip = extract_path / zip_name
    if not nested_zip.exists():
        print(f"{zip_name} not found.")
        continue

    with zipfile.ZipFile(nested_zip, 'r') as archive:
        # "train_images.zip" -> "train_images"
        target_dir = extract_path / zip_name.replace(".zip", "")
        target_dir.mkdir(exist_ok=True)
        archive.extractall(target_dir)
        print(f"Extracted {zip_name} to {target_dir}")

## ✅ Download Complete

You now have:
- `train.csv`: CSV file with encoded mask annotations
- `train_images/`: directory with ~12,000 training SEM-like images
- `test_images/`: directory with test images
- `sample_submission.csv`: template for prediction format

Next step: parse the annotations and visualize defects.

## 🖼️ Visualize Sample Defect Annotations

This cell loads a few random images from the training set and overlays defect masks based on the `train.csv` annotations. 
This helps explore what the defect labels look like, and how masks are encoded.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pathlib import Path

# Load CSV with RLE annotations
csv_path = Path("severstal_dataset/train.csv")
df = pd.read_csv(csv_path)

# Prepare for decoding
# Handle both layouts of train.csv: a combined "ImageId_ClassId" column
# (e.g. "abc.jpg_1") or separate "ImageId" / "ClassId" columns.
# NOTE(review): confirm which layout your download uses.
if 'ImageId_ClassId' in df.columns:
    # Split on the LAST underscore only, so a file name that itself contains
    # an underscore still yields exactly two columns.
    df[['ImageId', 'ClassId']] = df['ImageId_ClassId'].str.rsplit('_', n=1, expand=True)
# Later code compares ClassId against str(class_id), so keep it as strings.
df['ClassId'] = df['ClassId'].astype(str)
# Rows without an encoded mask carry no defect and are dropped.
df = df.dropna(subset=['EncodedPixels'])

def rle_decode(mask_rle, shape=(1600, 256)):
    """Decode a run-length-encoded mask into a 2-D binary array.

    Args:
        mask_rle: String of space-separated ``start length`` pairs. Starts are
            1-based indices into the flattened image, with pixels numbered
            down each column first and then column by column (the ordering
            described on the competition's evaluation page).
        shape: ``(width, height)`` of the image in pixels.

    Returns:
        ``np.uint8`` array of shape ``(height, width)`` - the same layout as
        ``np.array(PIL.Image)`` - with 1 inside the encoded runs and 0 elsewhere.

    Raises:
        ValueError: If the string does not consist of complete start/length pairs.
    """
    width, height = shape
    tokens = [int(tok) for tok in mask_rle.split()]
    if len(tokens) % 2:
        raise ValueError("RLE string must contain start/length pairs")

    flat = np.zeros(width * height, dtype=np.uint8)
    for start, length in zip(tokens[::2], tokens[1::2]):
        lo = start - 1  # RLE starts are 1-based
        flat[lo:lo + length] = 1

    # The flat index walks down a column first, so consecutive chunks of
    # `height` values are columns: reshape to (width, height), then transpose
    # to get rows x columns. The previous reshape((height, width)).T returned a
    # (width, height) array with pixels mapped to the wrong positions.
    return flat.reshape((width, height)).T

def show_image_with_masks(image_id, base_path):
    """Plot a training image next to a copy overlaid with its defect masks.

    Reads the annotations from the module-level ``df`` (columns ``ImageId``,
    ``ClassId`` as strings, ``EncodedPixels``) and decodes them with
    ``rle_decode``.

    Args:
        image_id: File name of the image inside ``train_images``.
        base_path: ``Path`` of the dataset root that contains ``train_images``.
    """
    image_path = base_path / "train_images" / image_id
    # Close the file handle as soon as the pixels are in memory.
    with Image.open(image_path) as img:
        image = np.array(img)

    # Keep the overlay 2-D even when the image decodes with colour channels;
    # a 3-channel mask would be drawn as a colour image and ignore the colormap.
    img_height, img_width = image.shape[:2]
    mask = np.zeros((img_height, img_width), dtype=np.uint8)

    for class_id in range(1, 5):
        rle = df.loc[(df['ImageId'] == image_id) & (df['ClassId'] == str(class_id)), 'EncodedPixels']
        if not rle.empty:
            decoded_mask = rle_decode(rle.values[0], shape=(img_width, img_height))
            mask[decoded_mask == 1] = class_id * 50  # shade for visual difference

    # Mask out the background so only defect pixels are tinted.
    overlay = np.ma.masked_equal(mask, 0)

    fig, (ax_raw, ax_overlay) = plt.subplots(1, 2, figsize=(12, 5))
    ax_raw.imshow(image, cmap='gray')
    ax_raw.set_title(f"Original: {image_id}")
    ax_raw.axis('off')

    ax_overlay.imshow(image, cmap='gray')
    # Fixed vmin/vmax keep each class on the same colour in every figure.
    ax_overlay.imshow(overlay, alpha=0.5, cmap='jet', vmin=0, vmax=200)
    ax_overlay.set_title("With Defect Overlay")
    ax_overlay.axis('off')
    plt.show()

# Show a few random samples
# Draw three distinct image ids (fixed seed) and plot each one.
unique_image_ids = df['ImageId'].drop_duplicates()
sample_ids = unique_image_ids.sample(3, random_state=42)
dataset_root = Path("severstal_dataset")
for img_id in sample_ids:
    show_image_with_masks(img_id, dataset_root)

## 📦 Convert RLE Masks to Bounding Boxes for YOLO

This cell converts each RLE mask in `train.csv` into a bounding box and saves it in YOLO format:

`class_id x_center y_center width height` (normalized coordinates)

It creates one `.txt` file per image (like YOLOv5 expects), placed in a `labels_yolo/` folder.

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

# Reload and clean the dataframe
df = pd.read_csv("severstal_dataset/train.csv")
# Handle both layouts of train.csv: a combined "ImageId_ClassId" column
# (e.g. "abc.jpg_1") or separate "ImageId" / "ClassId" columns.
# NOTE(review): confirm which layout your download uses.
if 'ImageId_ClassId' in df.columns:
    # Split on the LAST underscore only, so a file name that itself contains
    # an underscore still yields exactly two columns.
    df[['ImageId', 'ClassId']] = df['ImageId_ClassId'].str.rsplit('_', n=1, expand=True)
# The label generator calls int(row ClassId); strings keep both layouts uniform.
df['ClassId'] = df['ClassId'].astype(str)
# Rows without an encoded mask carry no defect and are dropped.
df = df.dropna(subset=['EncodedPixels'])

# Utility to decode RLE and get bounding box
def rle_to_bbox(rle, shape=(1600, 256)):
    """Return the tight bounding box of a run-length-encoded mask.

    Args:
        rle: String of space-separated ``start length`` pairs. Starts are
            1-based indices into the flattened image, with pixels numbered
            down each column first and then column by column (the ordering
            described on the competition's evaluation page).
        shape: ``(width, height)`` of the image in pixels.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` as inclusive pixel indices, or
        ``None`` when the mask is empty.

    Raises:
        ValueError: If the string does not consist of complete start/length pairs.
    """
    width, height = shape
    tokens = [int(tok) for tok in rle.split()]
    if len(tokens) % 2:
        raise ValueError("RLE string must contain start/length pairs")

    flat = np.zeros(width * height, dtype=np.uint8)
    for start, length in zip(tokens[::2], tokens[1::2]):
        lo = start - 1  # RLE starts are 1-based
        flat[lo:lo + length] = 1

    # Column-major data: reshape to (width, height) and transpose to get a
    # (height, width) mask. The previous reshape((height, width)).T produced a
    # (width, height) array, so x/y extents were taken from the wrong axes.
    mask = flat.reshape((width, height)).T

    rows = np.where(np.any(mask, axis=1))[0]  # y indices containing defect pixels
    cols = np.where(np.any(mask, axis=0))[0]  # x indices containing defect pixels
    if rows.size == 0 or cols.size == 0:
        return None
    return int(cols[0]), int(rows[0]), int(cols[-1]), int(rows[-1])

# Prepare output directory
label_dir = Path("severstal_dataset/labels_yolo")
label_dir.mkdir(parents=True, exist_ok=True)

# Image size (pixels) that the normalised YOLO coordinates refer to.
IMG_WIDTH, IMG_HEIGHT = 1600, 256

# Generate YOLO label files
# One line per annotation row: each class mask is reduced to a single box that
# encloses every pixel of that class in the image.
for image_id, group in df.groupby('ImageId'):
    label_lines = []
    for class_label, encoded in zip(group['ClassId'], group['EncodedPixels']):
        bbox = rle_to_bbox(encoded, shape=(IMG_WIDTH, IMG_HEIGHT))
        if bbox is None:
            continue
        x_min, y_min, x_max, y_max = bbox
        # The bounds are inclusive pixel indices, so the box covers
        # (max - min + 1) pixels; without the +1 a one-pixel-wide or
        # one-pixel-tall defect would get a zero-size box.
        box_w = x_max - x_min + 1
        box_h = y_max - y_min + 1
        x_center = (x_min + box_w / 2) / IMG_WIDTH
        y_center = (y_min + box_h / 2) / IMG_HEIGHT
        width = box_w / IMG_WIDTH
        height = box_h / IMG_HEIGHT
        class_id = int(class_label) - 1  # YOLO expects 0-based class IDs
        label_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")
    if label_lines:
        with open(label_dir / (image_id.replace('.jpg', '.txt')), 'w') as f:
            f.write('\n'.join(label_lines))

# Count only label files, not anything else that may sit in the folder.
print(f"Done. Saved {len(list(label_dir.glob('*.txt')))} label files in {label_dir}")

## ✅ Visualize YOLO Bounding Boxes

This cell loads a random `.jpg` image from the training set and overlays the corresponding bounding boxes extracted from YOLO `.txt` labels.
Useful for checking if the bounding boxes correctly map to defect regions.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import random
import os

def draw_yolo_boxes(image_path, label_path, image_size=(1600, 256)):
    """Display an image with the boxes from its YOLO label file drawn on top.

    Args:
        image_path: Path of the image to show.
        label_path: Path of the label file; each non-blank line is
            ``class x_center y_center width height`` in normalised coordinates.
        image_size: ``(width, height)`` in pixels used to scale the normalised
            coordinates back to pixel units.

    If the label file does not exist, a message is printed and the image is
    shown without boxes.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # Keep the file open only while matplotlib reads the pixels.
    with Image.open(image_path) as image:
        ax.imshow(image, cmap='gray')
    ax.set_title(f"YOLO Bounding Boxes: {os.path.basename(image_path)}")
    ax.axis('off')

    if os.path.exists(label_path):
        with open(label_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                cls, x_center, y_center, width, height = map(float, fields)
                # convert to pixel coordinates (top-left corner + size)
                x = (x_center - width / 2) * image_size[0]
                y = (y_center - height / 2) * image_size[1]
                w = width * image_size[0]
                h = height * image_size[1]
                rect = patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='red', facecolor='none')
                ax.add_patch(rect)
                ax.text(x, y - 5, f"Class {int(cls)}", color='yellow', fontsize=10, weight='bold')
    else:
        print("No label file found.")

    # Always finish the figure, instead of returning with it half-built.
    plt.show()

# Sample one image from the labels directory
label_dir = Path("severstal_dataset/labels_yolo")
image_dir = Path("severstal_dataset/train_images")

# Pick a label file at random and derive the matching image file name from it.
candidate_labels = list(label_dir.glob("*.txt"))
sample_txt = random.choice(candidate_labels)
image_name = sample_txt.name.replace(".txt", ".jpg")
sample_img = image_dir / image_name

draw_yolo_boxes(sample_img, sample_txt)

# 🛠️ Prepare Dataset for YOLO Training

This section prepares the Severstal SEM dataset in the folder structure required by YOLOv8. It splits the dataset into training and validation sets, organizes images and labels, and generates a `data.yaml` file.

In [None]:
import os, shutil, random
from pathlib import Path

# Source folders and the root of the YOLO-style dataset being assembled.
base_dir = Path("severstal_yolo")
image_src = Path("severstal_dataset/train_images")
label_src = Path("severstal_dataset/labels_yolo")

# Create images/{train,val} and labels/{train,val} under base_dir.
for subset in ['train', 'val']:
    (base_dir / f"images/{subset}").mkdir(parents=True, exist_ok=True)
    (base_dir / f"labels/{subset}").mkdir(parents=True, exist_ok=True)

# Directory listings come back in no guaranteed order, so sort before the
# seeded shuffle; otherwise the same seed can give a different split on
# another machine or filesystem.
all_files = sorted(f.stem for f in label_src.glob("*.txt"))
random.seed(42)
random.shuffle(all_files)

# 80 % of the labelled images go to training, the rest to validation.
split_index = int(0.8 * len(all_files))
train_files = all_files[:split_index]
val_files = all_files[split_index:]

def move_files(file_list, subset):
    """Copy the image and label file of every stem into the ``subset`` folders.

    Despite the name, files are copied (``shutil.copy``), not moved; the
    sources in ``image_src`` / ``label_src`` stay in place.

    Args:
        file_list: Iterable of file stems (names without extension).
        subset: Name of the destination sub-folder under ``images/`` and ``labels/``.
    """
    for stem in file_list:
        image_name, label_name = f"{stem}.jpg", f"{stem}.txt"
        shutil.copy(image_src / image_name, base_dir / "images" / subset / image_name)
        shutil.copy(label_src / label_name, base_dir / "labels" / subset / label_name)

# Populate the train split first, then the val split.
for subset_name, stems in (("train", train_files), ("val", val_files)):
    move_files(stems, subset_name)
print(f"Moved {len(train_files)} train and {len(val_files)} val images/labels.")

## 📄 Generate `data.yaml` file

This YAML file tells YOLOv8 where to find training/validation data and how many classes are used.

In [None]:
yaml_path = base_dir / "data.yaml"

# Write an absolute dataset root. NOTE(review): Ultralytics resolves a relative
# `path` against its configured datasets directory rather than the notebook's
# working directory, which can end in "images not found" - confirm against the
# installed version. as_posix() avoids backslashes inside the YAML string.
dataset_root = base_dir.resolve().as_posix()

yaml_text = f"""path: "{dataset_root}"
train: images/train
val: images/val

nc: 4
names: ["defect1", "defect2", "defect3", "defect4"]
"""

with open(yaml_path, 'w') as f:
    f.write(yaml_text)
print(f"Created {yaml_path}")

## 🚀 Train YOLOv8 on the Severstal Dataset

Now that the data is ready, we can start training a YOLOv8 model using the Ultralytics library.

In [None]:
# !pip install ultralytics  # Uncomment if not already installed
from ultralytics import YOLO

# Starting checkpoint; replace with yolov8s.pt for better accuracy.
pretrained_weights = "yolov8n.pt"
# Training settings, passed to model.train() as keyword arguments.
train_config = {"data": str(yaml_path), "epochs": 30, "imgsz": 640, "batch": 8}

model = YOLO(pretrained_weights)
model.train(**train_config)

## 🧪 Visualize YOLOv8 Predictions on Validation Images

This cell runs inference using the trained YOLOv8 model and shows bounding box predictions over validation images.

In [None]:
from ultralytics import YOLO
from pathlib import Path
import random
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Load the best trained model (from previous run)
# NOTE(review): Ultralytics appears to number repeated runs (train, train2, ...),
# so a fixed "train" folder would load stale weights after a re-train. Pick the
# most recently written best.pt and fall back to the original path if none exists.
weight_candidates = sorted(
    Path("runs/detect").glob("train*/weights/best.pt"),
    key=lambda p: p.stat().st_mtime,
)
best_weights = weight_candidates[-1] if weight_candidates else Path("runs/detect/train/weights/best.pt")
model = YOLO(str(best_weights))

# Pick a few random validation images
# Sorted so the candidate order does not depend on the filesystem; the sample
# size is capped so a split with fewer than 3 images does not raise ValueError.
val_images = sorted(Path("severstal_yolo/images/val").glob("*.jpg"))
sample_paths = random.sample(val_images, min(3, len(val_images)))

# Run inference on each sampled image and draw the predicted boxes on it.
for img_path in sample_paths:
    prediction = model.predict(source=str(img_path), save=False, conf=0.25)[0]

    # The image is converted from BGR to RGB before it is handed to matplotlib.
    canvas = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)

    for det in prediction.boxes:
        # Corner coordinates, truncated to whole pixels for drawing.
        left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())
        class_name = model.names[int(det.cls[0])]
        score = det.conf[0].item()
        caption = f"{class_name} {score:.2f}"
        cv2.rectangle(canvas, (left, top), (right, bottom), (255, 0, 0), 2)
        cv2.putText(canvas, caption, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 1)

    plt.figure(figsize=(12, 6))
    plt.imshow(canvas)
    plt.title(f"Predictions on: {img_path.name}")
    plt.axis('off')
    plt.show()