# Using a small CNN for the soil classification dataset

In [1]:
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2025-05-25 16:54:11.887477: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748192052.087128      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748192052.155533      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- Paths ---
# Kaggle input locations: the label CSV plus the train and test image folders.
csv_path = (
    "/kaggle/input/soil-classification-part-2/"
    "soil_competition-2025/train_labels.csv"
)
train_image_dir = (
    "/kaggle/input/soil-classification-part-2/"
    "soil_competition-2025/train"
)
test_image_dir = (
    "/kaggle/input/soil-classification-part-2/"
    "soil_competition-2025/test"
)

# Target size passed to cv2.resize for every image that gets loaded.
img_size = (299, 299)


In [3]:
# --- Load training data ---
# Label table read from csv_path, plus empty lists for images (X) and labels (y).
df = pd.read_csv(csv_path)
X, y = [], []

In [4]:
# Read every labelled training image, resize it to img_size and build the
# feature/label arrays.
#
# The accumulators are local to this cell so it can be re-run safely: the
# original appended to the global X / y and then rebound them to NumPy arrays,
# so a second execution failed on `.append`.
images = []
labels = []
unreadable = []

# Per the original author's note, the label column is all 1's — TODO confirm.
for img_name, label in zip(df['image_id'], df['label']):
    img_path = os.path.join(train_image_dir, img_name)
    if not os.path.exists(img_path):
        continue  # rows whose file is missing on disk are skipped, as before

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an undecodable file by returning None rather than
        # raising; handing that to cv2.resize would fail with an opaque error.
        unreadable.append(img_name)
        continue

    images.append(cv2.resize(img, img_size))
    labels.append(label)

if unreadable:
    print(f"Skipped {len(unreadable)} unreadable training image(s), e.g. {unreadable[:5]}")

# Scale pixel values by 1/255. float32 is requested explicitly: it halves the
# memory of the default float64 result of the division.
X = np.array(images, dtype=np.float32) / 255.0
y = np.array(labels)

In [5]:
# Hold out 10% of the loaded samples for validation; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [6]:
# --- Data augmentation ---
# Random geometric augmentation (rotation, zoom, flips, shifts) for the
# training batches.
#
# datagen.fit(X_train) was dropped: fit() only computes dataset statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, and none
# of those options is enabled here, so the call merely copied the training
# array without affecting the generator.
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True,
    width_shift_range=0.2,
    height_shift_range=0.2,
)

# NOTE(review): the image arrays are already divided by 255 when they are
# loaded, so pushing them through this generator would rescale them a second
# time. It is not used in the cells visible here; it is left unchanged in case
# a later cell relies on it — confirm, then fix or remove.
val_datagen = ImageDataGenerator(rescale=1./255)


In [7]:
# --- Build CNN ---
# Three Conv -> MaxPool -> Dropout -> BatchNorm stages followed by a small
# dense head with a single sigmoid output.
#
# The input shape is derived from img_size instead of being hard-coded, so the
# model follows the resize setting. cv2.resize takes (width, height), whereas
# the image arrays are laid out (height, width, channels), hence the swap.
input_shape = (img_size[1], img_size[0], 3)

model = Sequential([
    # An explicit Input layer replaces `input_shape=` on the first Conv2D,
    # which Keras warns about when used inside a Sequential model.
    Input(shape=input_shape),

    Conv2D(32, (3,3), activation='relu'),
    MaxPooling2D(2,2),
    Dropout(0.3),
    BatchNormalization(),

    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(2,2),
    Dropout(0.3),
    BatchNormalization(),

    Conv2D(128, (3,3), activation='relu'),
    MaxPooling2D(2,2),
    Dropout(0.3),
    BatchNormalization(),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1748192123.968893      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [8]:
# --- Load test images ---
# Read every .jpg/.jpeg/.png file in test_image_dir, resized to img_size.
# test_filenames[i] always names the image stored at X_test[i].
test_images = []
test_filenames = []
unreadable_test = []

# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# test ordering stable from run to run.
for fname in sorted(os.listdir(test_image_dir)):
    if not fname.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(test_image_dir, fname)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None for files it cannot decode; skip them here
        # instead of letting cv2.resize fail with an opaque error.
        unreadable_test.append(fname)
        continue

    test_images.append(cv2.resize(img, img_size))
    test_filenames.append(fname)

if unreadable_test:
    # These files get no prediction downstream, so report them loudly.
    print(f"Skipped {len(unreadable_test)} unreadable test image(s), e.g. {unreadable_test[:5]}")

# Same 1/255 scaling as the training images; float32 halves the memory of the
# default float64 result.
X_test = np.array(test_images, dtype=np.float32) / 255.0


In [9]:
# Early stopping watches val_loss with a patience of 3 epochs and restores the
# best weights seen when it triggers.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Augmented training batches of 32; validation uses the hold-out arrays as-is.
train_batches = datagen.flow(X_train, y_train, batch_size=32)

model.fit(
    train_batches,
    validation_data=(X_val, y_val),
    epochs=10,
    callbacks=[early_stop],
)

  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1748192148.101272     101 service.cc:148] XLA service 0x7e8ad4014a60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748192148.102383     101 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1748192148.590597     101 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 2/35[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 60ms/step - accuracy: 0.5469 - loss: 2.3356 

I0000 00:00:1748192156.159689     101 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 769ms/step - accuracy: 0.8866 - loss: 0.8786 - val_accuracy: 1.0000 - val_loss: 1.7004e-28
Epoch 2/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 548ms/step - accuracy: 0.9940 - loss: 0.0703 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 550ms/step - accuracy: 0.9994 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 549ms/step - accuracy: 0.9991 - loss: 0.0214 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 549ms/step - accuracy: 0.9982 - loss: 0.0651 - val_accuracy: 1.0000 - val_loss: 0.0000e+00


<keras.src.callbacks.history.History at 0x7e8d32b5a550>

In [10]:

# --- Predict probabilities ---
# Model outputs for the test images, collapsed to a 1-D vector.
y_probs = model.predict(X_test).reshape(-1)

# --- Threshold tuning using validation data ---
# Score the validation split, then pick the threshold whose F1 is highest.
y_val_probs = model.predict(X_val).reshape(-1)
prec, rec, thresh = precision_recall_curve(y_val, y_val_probs)
# The 1e-8 term keeps the denominator non-zero when precision + recall == 0.
f1_scores = np.divide(2 * prec * rec, prec + rec + 1e-8)
best_threshold = thresh[f1_scores.argmax()]
print(f"🔍 Best threshold: {best_threshold:.2f}")

# --- Apply best threshold to test predictions ---
# 1 where the test probability reaches the tuned threshold, otherwise 0.
y_pred = np.where(y_probs >= best_threshold, 1, 0)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 61ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
🔍 Best threshold: 1.00
