In [None]:
# Imports
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchvision.models import resnet18, ResNet18_Weights
import os

# Image preprocessing helper
def resize_images(input_folder, output_folder, image_list, size=(224, 224)):
    """Resize the listed images and save them as RGB copies.

    Each file name in ``image_list`` is opened from ``input_folder``,
    converted to RGB, resized to ``size`` with Lanczos resampling and written
    to ``output_folder`` under the same file name. A file that cannot be
    processed is reported on stdout and skipped; it does not abort the batch.
    A summary line with the number of images written by this call is printed
    at the end.

    Args:
        input_folder: Directory containing the source images.
        output_folder: Directory for the resized copies; created if missing.
        image_list: Iterable of image file names relative to ``input_folder``.
        size: Target ``(width, height)`` in pixels.
    """
    os.makedirs(output_folder, exist_ok=True)
    resized_count = 0
    for image_id in image_list:
        input_path = os.path.join(input_folder, image_id)
        output_path = os.path.join(output_folder, image_id)
        try:
            with Image.open(input_path) as img:
                # Convert before resizing: Pillow falls back to
                # nearest-neighbour resampling for palette ("P") and 1-bit
                # images, so resizing first would ignore the LANCZOS request.
                img_resized = img.convert('RGB').resize(size, Image.LANCZOS)
                img_resized.save(output_path)
        except Exception as e:
            # Deliberately broad: one unreadable or corrupt file should be
            # reported and skipped rather than stop the whole batch.
            print(f"Failed to process {image_id}: {e}")
        else:
            resized_count += 1
    # Report what this call actually wrote. Listing the output folder would
    # also count files left over from earlier runs and hide failures.
    print(f"{resized_count} {input_folder} images resized successfully!")

# Dataset wrapper around the labelled soil images
class SoilDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs described by a DataFrame.

    The frame must provide an ``"image_id"`` column (file name inside
    ``image_dir``) and a ``"label"`` column. Images are loaded lazily on
    access and converted to RGB.

    Args:
        df: DataFrame with ``"image_id"`` and ``"label"`` columns.
        image_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, image_dir, transform=None):
        # Re-number rows 0..len-1 so positional indices from a sampler can be
        # used with .loc even when df is a shuffled or filtered split.
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return ``(image, label)``."""
        img_id = self.df.loc[idx, "image_id"]
        label = self.df.loc[idx, "label"]
        image_path = os.path.join(self.image_dir, img_id)
        # Close the underlying file deterministically; convert() returns a
        # new, fully loaded image that stays valid after the file is closed.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# Data loading and preparation
df = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv")
le = LabelEncoder()
# Encode the soil-type names as integer class ids (order given by le.classes_)
df["label"] = le.fit_transform(df["soil_type"])

# Stratified 80/20 split so both parts keep the same class proportions
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Image resizing. Both datasets below read from 'train_resized', so every
# labelled image is resized. Resizing only the training split would leave
# the validation files missing from that folder.
resize_images("/kaggle/input/soil-classification/soil_classification-2025/train", 'train_resized', df['image_id'].tolist())

# Transforms
# Training transform: includes random flip augmentation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                         std=[0.229, 0.224, 0.225])
])
# Validation transform: same preprocessing without random augmentation, so
# validation metrics are deterministic and comparable across epochs.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Data loaders
train_dataset = SoilDataset(train_df, "train_resized", transform)
val_dataset = SoilDataset(val_df, "train_resized", val_transform)

# Shuffle only the training data; keep validation order fixed
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

# Model setup
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18(weights=ResNet18_Weights.DEFAULT)
# Replace the classification head. Its size comes from the label encoder, so
# it always matches the number of soil types in the training labels.
num_classes = len(le.classes_)
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop: one pass over train_loader, then a full validation pass,
# per epoch
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # criterion returns the batch mean. Weight it by the batch size so
        # the epoch figure is a per-sample mean, even when the last batch
        # is smaller than the rest.
        running_loss += loss.item() * batch_size
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += batch_size

    # Report the mean loss per sample. A raw sum over batches would scale
    # with the number of batches and not be comparable between runs.
    train_loss = running_loss / total
    train_acc = correct / total
    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")

    # Validation
    model.eval()
    val_preds = []
    val_targets = []

    # No gradients are needed for evaluation
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)
            val_preds.extend(predicted.cpu().tolist())
            val_targets.extend(labels.tolist())

    val_acc = float(np.mean(np.asarray(val_preds) == np.asarray(val_targets)))
    # Macro average: every class contributes equally regardless of its size
    val_f1 = f1_score(val_targets, val_preds, average='macro')

    print(f"Validation Accuracy: {val_acc:.4f}, F1 Score (macro): {val_f1:.4f}")

# Persist the trained weights (state_dict only, not the full model object)
weights_path = "resnet_soil_model.pth"
torch.save(model.state_dict(), weights_path)

# Show which integer id each soil-type name was encoded to
class_names = le.classes_
label_mapping = dict(zip(class_names, le.transform(class_names)))
print(label_mapping)