In [None]:
# All code related to adding non-soil images
!pip install bing-image-downloader

import shutil
import os
import pandas as pd
import numpy as np
from PIL import Image
from bing_image_downloader import downloader
from tensorflow.keras.datasets import mnist

# Stage the competition data under /kaggle/working, where the rest of this
# cell adds extra images and label rows to it.
source_dir = "/kaggle/input/soil-classification-part-2/soil_competition-2025"
working_dir = "/kaggle/working"
train_dir = os.path.join(working_dir, "train")

# Image folder first; dirs_exist_ok=True lets the copy proceed when the
# destination folder already exists (e.g. when the cell is re-run).
original_train_dir = os.path.join(source_dir, "train")
shutil.copytree(original_train_dir, train_dir, dirs_exist_ok=True)

# Then the labels file, placed next to the copied train/ folder.
original_labels_csv = os.path.join(source_dir, "train_labels.csv")
working_labels_csv = os.path.join(working_dir, "train_labels.csv")
shutil.copy(original_labels_csv, working_labels_csv)

print("Original soil dataset copied to /kaggle/working")

# Fetch non-soil images from Bing: one search per query term, each search
# writing into non_soil_root.
queries = [
    "car",
    "building",
    "city street",
    "forest",
    "ocean",
    "snow",
    "road",
    "sky",
    "dog",
    "flower",
]

non_soil_root = "non_soil_images"

# Settings shared by every search (at most 50 results per query).
# NOTE(review): adult_filter_off=True presumably disables Bing's adult-content
# filter -- confirm that is intended for images going into a training set.
bing_options = {
    "limit": 50,
    "output_dir": non_soil_root,
    "adult_filter_off": True,
    "force_replace": False,
    "timeout": 60,
}

for query in queries:
    downloader.download(query, **bing_options)
print("Downloaded non-soil images from Bing")

# Load MNIST digits as a second source of non-soil images
IMAGES_COUNT = 500
mnist_dir = "mnist"
os.makedirs(mnist_dir, exist_ok=True)

# Only the pixel arrays are kept; the digit labels are discarded.
(x_train, _), (x_test, _) = mnist.load_data()

# Stack the train and test splits, then keep the first IMAGES_COUNT entries.
all_digits = np.concatenate((x_train, x_test), axis=0)
mnist_images = all_digits[:IMAGES_COUNT]

def preprocess_img(img):
    """Turn an image array into a 224x224 RGB PIL image.

    Args:
        img: Array accepted by ``Image.fromarray`` (this file passes it
            MNIST digit arrays).

    Returns:
        The image resized to 224x224 and then converted to RGB mode.
    """
    pil_img = Image.fromarray(img)
    resized = pil_img.resize((224, 224))
    return resized.convert("RGB")

# Write each selected MNIST digit to disk as a PNG named mnist_<index>.png.
for i, img_array in enumerate(mnist_images):
    img = preprocess_img(img_array)
    img.save(os.path.join(mnist_dir, f"mnist_{i:03d}.png"))
# Report the number actually written rather than the requested IMAGES_COUNT:
# mnist_images is produced by a slice, which yields fewer items than asked
# for when the request exceeds the available data.
print(f"Saved {len(mnist_images)} MNIST images")

# Relocate every extra image into the training folder.
# MNIST files keep their names.
for mnist_name in os.listdir(mnist_dir):
    src_path = os.path.join(mnist_dir, mnist_name)
    dst_path = os.path.join(train_dir, mnist_name)
    shutil.move(src_path, dst_path)

# Each entry of non_soil_root is treated as a sub-folder of images; the
# sub-folder name is prepended to the file name when the tree is flattened
# into train/.
for query_folder in os.listdir(non_soil_root):
    query_path = os.path.join(non_soil_root, query_folder)
    for image_name in os.listdir(query_path):
        flat_name = f"{query_folder}_{image_name}"
        shutil.move(
            os.path.join(query_path, image_name),
            os.path.join(train_dir, flat_name),
        )
print("Moved MNIST & Bing images to /train")

# Cleanup
# Remove the temporary folders through the same variables that named them
# when they were created, so changing either folder name cannot leave this
# step pointing at a stale hard-coded path. ignore_errors=True keeps the
# cleanup best-effort (e.g. a folder that is already gone).
for temp_dir in (non_soil_root, mnist_dir):
    shutil.rmtree(temp_dir, ignore_errors=True)
print("🧹 Cleaned up temporary folders.")

# Update train_labels.csv with label=0 for new images
labels_csv_path = os.path.join(working_dir, "train_labels.csv")
df = pd.read_csv(labels_csv_path)
existing_images = set(df["image_id"])

# os.listdir returns names in arbitrary order. List the folder once and sort
# it so the appended rows come out in the same order on every run, keeping
# anything order-dependent downstream (e.g. a seeded split) reproducible.
train_files = sorted(os.listdir(train_dir))

# MNIST: files carrying the mnist_ prefix that have no label row yet.
mnist_files = [f for f in train_files if f.startswith("mnist_")]
mnist_df = pd.DataFrame([{"image_id": f, "label": 0} for f in mnist_files if f not in existing_images])

# Bing: recognised indirectly as every remaining jpg/jpeg/png in train/ that
# has no label row and is not an MNIST file. Files with any other extension
# stay in train/ without a label row.
bing_files = [f for f in train_files
              if f.lower().endswith((".jpg", ".jpeg", ".png")) and
              f not in existing_images and not f.startswith("mnist_")]
bing_df = pd.DataFrame([{"image_id": f, "label": 0} for f in bing_files])

# Final CSV: original rows first, then MNIST, then Bing, overwriting the
# working copy of the labels file in place.
df_updated = pd.concat([df, mnist_df, bing_df], ignore_index=True)
df_updated.to_csv(labels_csv_path, index=False)
print(f"train_labels.csv updated with {len(mnist_df)} MNIST and {len(bing_df)} Bing images")