In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input tree.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import os
import uuid
import math
import json
import warnings
import numpy as np
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Flatten, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TerminateOnNaN

# -----------------------------
# SETUP
# -----------------------------
# Dataset root: image filenames and the annotation JSON are joined onto this path.
IMAGES_PATH = '/kaggle/input/xview-recognition'  # Update to Kaggle dataset path
# One seed shared by NumPy's and TensorFlow's global random generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# -----------------------------
# Categories
# -----------------------------
# Class id -> label. The position in the list below is the class id.
categories = dict(enumerate([
    'Cargo plane', 'Small car', 'Bus', 'Truck',
    'Motorboat', 'Fishing vessel', 'Dump truck', 'Excavator',
    'Building', 'Helipad', 'Storage tank', 'Shipping container',
    'Pylon',
]))
NUM_CATEGORIES = len(categories)
# Reverse lookup: label -> class id.
category_to_index = {label: idx for idx, label in categories.items()}

# -----------------------------
# Data classes
# -----------------------------
class GenericObject:
    """One annotated object: a random id, a bounding box, a category and a score.

    Every field except ``id`` starts at a -1 placeholder until it is filled in.
    """

    def __init__(self):
        self.id = uuid.uuid4()
        self.bb = (-1,) * 4
        self.category = self.score = -1

class GenericImage:
    """An image file name together with the objects annotated on it."""

    def __init__(self, filename):
        self.filename = filename
        # Four -1 entries act as a "not set" placeholder for the tile.
        self.tile = np.array([-1] * 4)
        self.objects = []

    def add_object(self, obj: GenericObject):
        """Append ``obj`` to this image's object list."""
        self.objects.append(obj)

# -----------------------------
# Load images with PIL
# -----------------------------
def load_geoimage(filename, target_size=(64,64)):
    """Load an image below IMAGES_PATH as a normalised RGB float array.

    Args:
        filename: Path of the image relative to ``IMAGES_PATH``.
        target_size: Size passed to ``PIL.Image.resize`` (width, height).

    Returns:
        ``float32`` array with values scaled from 0-255 down to 0-1.
    """
    full_path = os.path.join(IMAGES_PATH, filename)
    # Pillow opens files lazily and may keep the handle open after decoding;
    # the context manager closes it deterministically. convert() returns an
    # independent in-memory copy, so it stays valid after the file is closed.
    with Image.open(full_path) as src:
        img = src.convert("RGB")  # ensures 3 channels
    img = img.resize(target_size)
    img_array = np.array(img).astype(np.float32) / 255.0
    return img_array

# -----------------------------
# Load annotations
# -----------------------------
# Parse the training annotation file stored next to the images.
json_file = os.path.join(IMAGES_PATH, 'xview_ann_train.json')
with open(json_file) as ifs:
    json_data = json.load(ifs)

# Build one GenericImage holding exactly one GenericObject per (image, annotation) pair.
# NOTE(review): zip() pairs the two dicts purely by iteration order, so this assumes
# 'images' and 'annotations' are stored in matching order — confirm against the dataset.
anns_dataset = []
for json_img, json_ann in zip(json_data['images'].values(), json_data['annotations'].values()):
    image = GenericImage(json_img['filename'])
    obj = GenericObject()
    obj.bb = tuple(map(int, json_ann['bbox']))  # bbox values converted to ints
    obj.category = json_ann['category_id']  # kept as stored; str and int are both handled later
    image.add_object(obj)
    anns_dataset.append(image)

# -----------------------------
# Split dataset
# -----------------------------
# 90 % / 10 % shuffled split, reproducible through RANDOM_SEED.
anns_train, anns_valid = train_test_split(
    anns_dataset, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
print('Training images:', len(anns_train), 'Validation images:', len(anns_valid))

# Flatten annotations
# One (filename, object) tuple per annotated object in each split.
objs_train = [(ann.filename, obj) for ann in anns_train for obj in ann.objects]
objs_valid = [(ann.filename, obj) for ann in anns_valid for obj in ann.objects]

# -----------------------------
# Compute class weights
# -----------------------------
def compute_class_weights(objs):
    """Derive inverse-frequency class weights from (filename, object) pairs.

    Each object's ``category`` may be a label string (looked up in
    ``category_to_index``) or anything ``int()`` accepts.

    Returns:
        ``(class_weights, counts)``: ``class_weights`` maps class index to
        ``total / (num_classes * count)``; ``counts`` is the per-class count
        array after clamping to a minimum of 1.
    """
    counts = np.zeros(NUM_CATEGORIES, dtype=np.int64)
    for _, obj in objs:
        label = obj.category
        cat_idx = category_to_index[label] if isinstance(label, str) else int(label)
        counts[cat_idx] += 1

    # Clamp so a class with no samples cannot cause a division by zero.
    counts = np.maximum(counts, 1)

    total = float(np.sum(counts))
    num_classes = len(counts)
    class_weights = {}
    for idx, class_count in enumerate(counts):
        class_weights[idx] = total / (num_classes * class_count)
    return class_weights, counts

# Weights come from the training split only; the generator reads the
# module-level class_weights to build per-sample weights.
class_weights, class_counts = compute_class_weights(objs_train)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

# -----------------------------
# Generator (memory safe)
# -----------------------------
# Size every image is resized to before it is fed to the network.
DOWNSAMPLE_SIZE = (64, 64)
# Number of samples per yielded batch.
BATCH_SIZE = 8

def generator_images(objs, batch_size=BATCH_SIZE, do_shuffle=False):
    """Yield ``(images, labels, sample_weights)`` batches forever.

    Images are loaded lazily, one batch at a time. One full pass over the data
    produces ``ceil(len(objs) / batch_size)`` batches; the last batch of a pass
    may be smaller than ``batch_size``.

    Args:
        objs: Sized sequence of ``(filename, object)`` pairs. ``object.category``
            is a label string or an integer class index.
        batch_size: Maximum number of samples per batch.
        do_shuffle: Reshuffle the sample order at the start of every pass.

    Yields:
        Three float32 tensors: stacked images, one-hot labels and the
        per-sample weights taken from the module-level ``class_weights``.

    Raises:
        ValueError: On first iteration if ``objs`` is empty. Without this check
            the endless loop would spin forever without yielding anything.
    """
    if len(objs) == 0:
        raise ValueError("generator_images needs at least one (filename, object) pair")

    # Shuffle a private copy so the caller's list is never reordered. The copy
    # is shuffled cumulatively, so the batch order matches shuffling in place.
    order = list(objs)
    while True:
        if do_shuffle:
            np.random.shuffle(order)
        for i in range(0, len(order), batch_size):
            group = order[i:i+batch_size]
            images, labels, sample_weights = [], [], []
            for filename, obj in group:
                img = load_geoimage(filename, target_size=DOWNSAMPLE_SIZE)
                images.append(img)

                # One-hot label
                prob = np.zeros(NUM_CATEGORIES, dtype=np.float32)
                cat_idx = category_to_index[obj.category] if isinstance(obj.category, str) else int(obj.category)
                prob[cat_idx] = 1.0
                labels.append(prob)

                # Sample weight
                sample_weights.append(float(class_weights[cat_idx]))

            images = tf.convert_to_tensor(np.stack(images), dtype=tf.float32)
            labels = tf.convert_to_tensor(np.stack(labels), dtype=tf.float32)
            sample_weights = tf.convert_to_tensor(np.stack(sample_weights), dtype=tf.float32)
            yield images, labels, sample_weights

# Two endless generators: training reshuffles on every pass, validation does not shuffle.
train_generator = generator_images(objs_train, do_shuffle=True)
valid_generator = generator_images(objs_valid, do_shuffle=False)

# -----------------------------
# FFNN Model
# -----------------------------
# Fully-connected classifier on flattened pixels: two hidden blocks of
# Dense -> BatchNorm -> LeakyReLU -> Dropout, then a softmax over the classes.
model = Sequential([
    Input(shape=(DOWNSAMPLE_SIZE[0], DOWNSAMPLE_SIZE[1], 3)),
    Flatten(),
    Dense(128, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.3),
    Dense(64, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.2),
    Dense(NUM_CATEGORIES, activation='softmax')
])
model.summary()

# -----------------------------
# Compile
# -----------------------------
# categorical_crossentropy matches the one-hot labels produced by the generator.
opt = Adam(learning_rate=1e-3)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# -----------------------------
# Callbacks
# -----------------------------
# Every callback except TerminateOnNaN watches val_accuracy; the checkpoint
# keeps only the best epoch in model.keras.
callbacks = [
    ModelCheckpoint('model.keras', monitor='val_accuracy', save_best_only=True, verbose=1),
    ReduceLROnPlateau('val_accuracy', factor=0.1, patience=5, verbose=1),
    # NOTE(review): patience=10 with EPOCHS = 10 below — early stopping looks unable to fire in this run.
    EarlyStopping('val_accuracy', patience=10, verbose=1),
    TerminateOnNaN()
]

# -----------------------------
# Training
# -----------------------------
EPOCHS = 10
# ceil() so the final, possibly smaller, batch is counted: one epoch is one full pass.
train_steps = math.ceil(len(objs_train)/BATCH_SIZE)
valid_steps = math.ceil(len(objs_valid)/BATCH_SIZE)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps,
    validation_data=valid_generator,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# Report the epoch with the highest validation accuracy ([0] is the fallback if the metric is missing).
best_idx = int(np.argmax(history.history.get('val_accuracy', [0])))
best_value = np.max(history.history.get('val_accuracy', [0]))
print(f'Best validation model: epoch {best_idx+1} - val_accuracy {best_value:.4f}')


2025-10-03 18:50:08.367204: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759517408.549025      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759517408.603625      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Training images: 16871 Validation images: 1875
Class counts: [ 592 2973 1557 1981  963  632 1118  726 3248  103 1320 1386  272]
Class weights: {0: 2.1921777546777546, 1: 0.4365184092732024, 2: 0.8335062496912208, 3: 0.6551081427406515, 4: 1.3476315999680486, 5: 2.0534323271665045, 6: 1.1607953763588825, 7: 1.7875609239245602, 8: 0.399559492231906, 9: 12.599701269604182, 10: 0.9831585081585081, 11: 0.9363414363414363, 12: 4.771210407239819}


I0000 00:00:1759517421.405157      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1759517421.405993      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/10


I0000 00:00:1759517425.688310     119 service.cc:148] XLA service 0x7f7e5c004520 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1759517425.689150     119 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1759517425.689172     119 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1759517425.986688     119 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   4/2109[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:33[0m 45ms/step - accuracy: 0.2448 - loss: 2.4473

I0000 00:00:1759517427.763154     119 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.1807 - loss: 2.3676
Epoch 1: val_accuracy improved from -inf to 0.26987, saving model to model.keras
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 77ms/step - accuracy: 0.1807 - loss: 2.3675 - val_accuracy: 0.2699 - val_loss: 1.9157 - learning_rate: 0.0010
Epoch 2/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.2423 - loss: 2.0159
Epoch 2: val_accuracy improved from 0.26987 to 0.30293, saving model to model.keras
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 33ms/step - accuracy: 0.2423 - loss: 2.0159 - val_accuracy: 0.3029 - val_loss: 1.8434 - learning_rate: 0.0010
Epoch 3/10
[1m2108/2109[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 28ms/step - accuracy: 0.2611 - loss: 1.9342
Epoch 3: val_accuracy did not improve from 0.30293
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0

The FFNN above still performs poorly; in fact it is worse than model 0: best accuracy = 0.37 (M0 reached 0.39).

What we did next was:
- Reduce input size for faster training and fewer parameters (32×32)

- Wider and deeper FFNN layers to capture more pixel interactions

- Slightly more aggressive Dropout and BatchNorm

- Lower learning rate for stability

- More epochs with EarlyStopping

- Retain class weights

In [6]:
import os
import uuid
import math
import json
import warnings
import numpy as np
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Flatten, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TerminateOnNaN

# -----------------------------
# SETUP
# -----------------------------
# Dataset root: image filenames and the annotation JSON are joined onto this path.
IMAGES_PATH = '/kaggle/input/xview-recognition'  # Kaggle dataset path
# One seed shared by NumPy's and TensorFlow's global random generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# -----------------------------
# Categories
# -----------------------------
# Class id -> label, numbered in the order the labels are listed.
categories = {
    idx: label
    for idx, label in enumerate((
        'Cargo plane', 'Small car', 'Bus', 'Truck',
        'Motorboat', 'Fishing vessel', 'Dump truck', 'Excavator',
        'Building', 'Helipad', 'Storage tank', 'Shipping container',
        'Pylon',
    ))
}
NUM_CATEGORIES = len(categories)
# Reverse lookup: label -> class id.
category_to_index = dict(zip(categories.values(), categories.keys()))

# -----------------------------
# Data classes
# -----------------------------
class GenericObject:
    """Annotation record for a single object.

    Holds a freshly generated UUID plus -1 placeholders for the bounding box,
    category and score.
    """

    def __init__(self):
        unset = -1
        self.id = uuid.uuid4()
        self.bb = (unset, unset, unset, unset)
        self.category = unset
        self.score = unset

class GenericImage:
    """Container tying an image file name to its annotated objects."""

    def __init__(self, filename):
        self.filename = filename
        # Placeholder tile: four -1 values until a real tile is assigned.
        self.tile = np.array((-1, -1, -1, -1))
        self.objects = list()

    def add_object(self, obj: GenericObject):
        """Register ``obj`` as belonging to this image."""
        self.objects.append(obj)

# -----------------------------
# Load images with PIL
# -----------------------------
def load_geoimage(filename, target_size=(32,32)):
    """Load an image below IMAGES_PATH as a normalised RGB float array.

    Args:
        filename: Path of the image relative to ``IMAGES_PATH``.
        target_size: Size passed to ``PIL.Image.resize`` (width, height).

    Returns:
        ``float32`` array with values scaled from 0-255 down to 0-1.
    """
    full_path = os.path.join(IMAGES_PATH, filename)
    # Close the file handle deterministically instead of relying on garbage
    # collection. convert() yields an in-memory copy that outlives the file.
    with Image.open(full_path) as src:
        img = src.convert("RGB")  # ensures 3 channels
    img = img.resize(target_size)
    img_array = np.array(img).astype(np.float32) / 255.0
    return img_array

# -----------------------------
# Load annotations
# -----------------------------
# Parse the training annotation file stored next to the images.
json_file = os.path.join(IMAGES_PATH, 'xview_ann_train.json')
with open(json_file) as ifs:
    json_data = json.load(ifs)

# Build one GenericImage holding exactly one GenericObject per (image, annotation) pair.
# NOTE(review): zip() pairs the two dicts purely by iteration order, so this assumes
# 'images' and 'annotations' are stored in matching order — confirm against the dataset.
anns_dataset = []
for json_img, json_ann in zip(json_data['images'].values(), json_data['annotations'].values()):
    image = GenericImage(json_img['filename'])
    obj = GenericObject()
    obj.bb = tuple(map(int, json_ann['bbox']))  # bbox values converted to ints
    obj.category = json_ann['category_id']  # kept as stored; str and int are both handled later
    image.add_object(obj)
    anns_dataset.append(image)

# -----------------------------
# Split dataset
# -----------------------------
# 90 % / 10 % shuffled split, reproducible through RANDOM_SEED.
anns_train, anns_valid = train_test_split(
    anns_dataset, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
print('Training images:', len(anns_train), 'Validation images:', len(anns_valid))

# Flatten annotations
# One (filename, object) tuple per annotated object in each split.
objs_train = [(ann.filename, obj) for ann in anns_train for obj in ann.objects]
objs_valid = [(ann.filename, obj) for ann in anns_valid for obj in ann.objects]

# -----------------------------
# Compute class weights
# -----------------------------
def compute_class_weights(objs):
    """Count samples per class and turn the counts into balancing weights.

    ``objs`` is an iterable of ``(filename, object)`` pairs. A string
    ``object.category`` is translated through ``category_to_index``; any other
    value is passed to ``int()``.

    Returns:
        ``(class_weights, counts)`` where ``class_weights[i]`` equals
        ``sum(counts) / (len(counts) * counts[i])`` and ``counts`` has been
        clamped to at least 1 per class.
    """
    counts = np.zeros(NUM_CATEGORIES, dtype=np.int64)
    for _, obj in objs:
        if isinstance(obj.category, str):
            cat_idx = category_to_index[obj.category]
        else:
            cat_idx = int(obj.category)
        counts[cat_idx] += 1

    # A floor of 1 keeps the division below defined for absent classes.
    counts = np.maximum(counts, 1)
    total = float(np.sum(counts))
    num_classes = len(counts)
    class_weights = {i: total / (num_classes * n) for i, n in enumerate(counts)}
    return class_weights, counts

# Weights come from the training split only; the generator reads the
# module-level class_weights to build per-sample weights.
class_weights, class_counts = compute_class_weights(objs_train)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

# -----------------------------
# Generator (memory safe)
# -----------------------------
# Size every image is resized to before it is fed to the network.
DOWNSAMPLE_SIZE = (32, 32)
# Number of samples per yielded batch.
BATCH_SIZE = 8

def generator_images(objs, batch_size=BATCH_SIZE, do_shuffle=False):
    """Yield ``(images, labels, sample_weights)`` batches forever.

    Images are loaded lazily, one batch at a time. One full pass over the data
    produces ``ceil(len(objs) / batch_size)`` batches; the last batch of a pass
    may be smaller than ``batch_size``.

    Args:
        objs: Sized sequence of ``(filename, object)`` pairs. ``object.category``
            is a label string or an integer class index.
        batch_size: Maximum number of samples per batch.
        do_shuffle: Reshuffle the sample order at the start of every pass.

    Yields:
        Three float32 tensors: stacked images, one-hot labels and the
        per-sample weights taken from the module-level ``class_weights``.

    Raises:
        ValueError: On first iteration if ``objs`` is empty. Without this check
            the endless loop would spin forever without yielding anything.
    """
    if len(objs) == 0:
        raise ValueError("generator_images needs at least one (filename, object) pair")

    # Shuffle a private copy so the caller's list is never reordered. The copy
    # is shuffled cumulatively, so the batch order matches shuffling in place.
    order = list(objs)
    while True:
        if do_shuffle:
            np.random.shuffle(order)
        for i in range(0, len(order), batch_size):
            group = order[i:i+batch_size]
            images, labels, sample_weights = [], [], []
            for filename, obj in group:
                img = load_geoimage(filename, target_size=DOWNSAMPLE_SIZE)
                images.append(img)

                # One-hot label for this sample's class.
                prob = np.zeros(NUM_CATEGORIES, dtype=np.float32)
                cat_idx = category_to_index[obj.category] if isinstance(obj.category, str) else int(obj.category)
                prob[cat_idx] = 1.0
                labels.append(prob)

                sample_weights.append(float(class_weights[cat_idx]))

            images = tf.convert_to_tensor(np.stack(images), dtype=tf.float32)
            labels = tf.convert_to_tensor(np.stack(labels), dtype=tf.float32)
            sample_weights = tf.convert_to_tensor(np.stack(sample_weights), dtype=tf.float32)
            yield images, labels, sample_weights

# Two endless generators: training reshuffles on every pass, validation does not shuffle.
train_generator = generator_images(objs_train, do_shuffle=True)
valid_generator = generator_images(objs_valid, do_shuffle=False)

# -----------------------------
# FFNN Model (deeper & wider)
# -----------------------------
# Three hidden blocks (512 -> 256 -> 128) of Dense -> BatchNorm -> LeakyReLU -> Dropout
# on flattened pixels, followed by a softmax over the classes.
model = Sequential([
    Input(shape=(DOWNSAMPLE_SIZE[0], DOWNSAMPLE_SIZE[1], 3)),
    Flatten(),

    Dense(512, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.4),

    Dense(256, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.3),

    Dense(128, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.2),

    Dense(NUM_CATEGORIES, activation='softmax')
])
model.summary()

# -----------------------------
# Compile
# -----------------------------
# categorical_crossentropy matches the one-hot labels produced by the generator.
opt = Adam(learning_rate=1e-4)  # lower LR for stability
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# -----------------------------
# Callbacks
# -----------------------------
# Every callback except TerminateOnNaN watches val_accuracy; the checkpoint
# keeps only the best epoch in model.keras.
callbacks = [
    ModelCheckpoint('model.keras', monitor='val_accuracy', save_best_only=True, verbose=1),
    ReduceLROnPlateau('val_accuracy', factor=0.1, patience=5, verbose=1),
    EarlyStopping('val_accuracy', patience=15, verbose=1),
    TerminateOnNaN()
]

# -----------------------------
# Training
# -----------------------------
EPOCHS = 30  # more epochs for FFNN
# ceil() so the final, possibly smaller, batch is counted: one epoch is one full pass.
train_steps = math.ceil(len(objs_train)/BATCH_SIZE)
valid_steps = math.ceil(len(objs_valid)/BATCH_SIZE)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps,
    validation_data=valid_generator,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# Report the epoch with the highest validation accuracy ([0] is the fallback if the metric is missing).
best_idx = int(np.argmax(history.history.get('val_accuracy', [0])))
best_value = np.max(history.history.get('val_accuracy', [0]))
print(f'Best validation model: epoch {best_idx+1} - val_accuracy {best_value:.4f}')

Training images: 16871 Validation images: 1875
Class counts: [ 592 2973 1557 1981  963  632 1118  726 3248  103 1320 1386  272]
Class weights: {0: 2.1921777546777546, 1: 0.4365184092732024, 2: 0.8335062496912208, 3: 0.6551081427406515, 4: 1.3476315999680486, 5: 2.0534323271665045, 6: 1.1607953763588825, 7: 1.7875609239245602, 8: 0.399559492231906, 9: 12.599701269604182, 10: 0.9831585081585081, 11: 0.9363414363414363, 12: 4.771210407239819}


Epoch 1/30
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1235 - loss: 2.5868
Epoch 1: val_accuracy improved from -inf to 0.26080, saving model to model.keras
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 41ms/step - accuracy: 0.1235 - loss: 2.5868 - val_accuracy: 0.2608 - val_loss: 1.9348 - learning_rate: 1.0000e-04
Epoch 2/30
[1m2108/2109[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - accuracy: 0.2078 - loss: 2.2069
Epoch 2: val_accuracy improved from 0.26080 to 0.31040, saving model to model.keras
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 29ms/step - accuracy: 0.2078 - loss: 2.2069 - val_accuracy: 0.3104 - val_loss: 1.8200 - learning_rate: 1.0000e-04
Epoch 3/30
[1m2107/2109[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - accuracy: 0.2351 - loss: 2.1028
Epoch 3: val_accuracy did not improve from 0.31040
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

Still, the model above only reaches 0.37 validation accuracy. That is above chance for a 13-class problem (≈0.08 for uniform guessing, ≈0.19 for always predicting the most frequent class), but it is still poor.
Next thing we did:
- Grayscale downsampled images → reduces input dimensionality.
- Intensity histogram (8 bins, computed on the grayscale image) → adds simple handcrafted features (optional, but helps FFNN learn better).
- Brightness and contrast augmentation → basic data augmentation.
- Wider layers → 1024 → 512 → 256 → 128 → 13.
- Class weights + oversampling → keep balanced learning.
- Increase patience because clearly ffNNs struggle with images (especially raw pixels as we saw above)

In [9]:
import os
import uuid
import math
import json
import numpy as np
from PIL import Image, ImageEnhance
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TerminateOnNaN

# -----------------------------
# SETUP
# -----------------------------
# Dataset root: image filenames and the annotation JSON are joined onto this path.
IMAGES_PATH = '/kaggle/input/xview-recognition'
# One seed shared by NumPy's and TensorFlow's global random generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# -----------------------------
# Categories
# -----------------------------
# Class id -> label. The position in the tuple below is the class id.
categories = dict(enumerate((
    'Cargo plane', 'Small car', 'Bus', 'Truck',
    'Motorboat', 'Fishing vessel', 'Dump truck', 'Excavator',
    'Building', 'Helipad', 'Storage tank', 'Shipping container',
    'Pylon',
)))
NUM_CATEGORIES = len(categories)
# Reverse lookup: label -> class id.
category_to_index = {name: class_id for class_id, name in categories.items()}

# -----------------------------
# Data classes
# -----------------------------
class GenericObject:
    """One annotated object: a random id, a bounding box, a category and a score.

    Every field except ``id`` starts at a -1 placeholder until it is filled in.
    """

    def __init__(self):
        self.id = uuid.uuid4()
        self.bb = tuple([-1] * 4)
        self.category, self.score = -1, -1

class GenericImage:
    """An image file name together with the objects annotated on it."""

    def __init__(self, filename):
        self.filename = filename
        # Four -1 entries act as a "not set" placeholder for the tile.
        self.tile = np.array([-1] * 4)
        self.objects = []

    def add_object(self, obj: GenericObject):
        """Append ``obj`` to this image's object list."""
        self.objects.append(obj)

# -----------------------------
# Load images (grayscale + augment)
# -----------------------------
def load_image_features(filename, target_size=(32,32), augment=False):
    """Turn one image into a flat grayscale + histogram feature vector.

    Args:
        filename: Path of the image relative to ``IMAGES_PATH``.
        target_size: Size passed to ``PIL.Image.resize`` (width, height).
        augment: When true, randomly scale brightness and contrast by a factor
            drawn from [0.8, 1.2) before the grayscale conversion.

    Returns:
        1-D ``float32`` array of length ``target_size[0] * target_size[1] + 8``:
        the flattened grayscale pixels scaled to 0-1, followed by an 8-bin
        intensity histogram normalised to sum to 1.
    """
    full_path = os.path.join(IMAGES_PATH, filename)
    # Close the file handle deterministically. convert() returns an in-memory
    # copy, so the image stays usable after the file is closed.
    with Image.open(full_path) as src:
        img = src.convert("RGB")

    if augment:
        img = ImageEnhance.Brightness(img).enhance(np.random.uniform(0.8, 1.2))
        img = ImageEnhance.Contrast(img).enhance(np.random.uniform(0.8, 1.2))

    img = img.convert("L")
    img = img.resize(target_size)
    img_array = np.array(img).astype(np.float32) / 255.0
    img_array = img_array.flatten()

    # Optional histogram (8 bins). Every pixel lies inside range=(0, 1), so the
    # bin counts always sum to the number of pixels and the division is safe.
    hist = np.histogram(img_array, bins=8, range=(0,1))[0].astype(np.float32)
    hist /= np.sum(hist)

    features = np.concatenate([img_array, hist])
    return features

# -----------------------------
# Load annotations
# -----------------------------
# Parse the training annotation file stored next to the images.
json_file = os.path.join(IMAGES_PATH, 'xview_ann_train.json')
with open(json_file) as ifs:
    json_data = json.load(ifs)

# Build one GenericImage holding exactly one GenericObject per (image, annotation) pair.
# NOTE(review): zip() pairs the two dicts purely by iteration order, so this assumes
# 'images' and 'annotations' are stored in matching order — confirm against the dataset.
anns_dataset = []
for json_img, json_ann in zip(json_data['images'].values(), json_data['annotations'].values()):
    image = GenericImage(json_img['filename'])
    obj = GenericObject()
    obj.bb = tuple(map(int, json_ann['bbox']))  # bbox values converted to ints
    obj.category = json_ann['category_id']
    
    # Ensure string categories are mapped to int
    # (later code indexes arrays and dicts directly with obj.category).
    if isinstance(obj.category, str):
        obj.category = category_to_index[obj.category]
    
    image.add_object(obj)
    anns_dataset.append(image)

# -----------------------------
# Split dataset
# -----------------------------
# 90 % / 10 % shuffled split, reproducible through RANDOM_SEED.
anns_train, anns_valid = train_test_split(
    anns_dataset, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
print('Training images:', len(anns_train), 'Validation images:', len(anns_valid))

# One (filename, object) tuple per annotated object in each split.
objs_train = [(ann.filename, obj) for ann in anns_train for obj in ann.objects]
objs_valid = [(ann.filename, obj) for ann in anns_valid for obj in ann.objects]

# -----------------------------
# Compute class weights
# -----------------------------
def compute_class_weights(objs):
    """Derive inverse-frequency class weights from (filename, object) pairs.

    An ``int`` ``object.category`` is used directly as the class index; any
    other value is looked up in ``category_to_index``.

    Returns:
        ``(class_weights, counts)``: ``class_weights`` maps class index to
        ``total / (num_classes * count)``; ``counts`` is the per-class count
        array after clamping to a minimum of 1.
    """
    counts = np.zeros(NUM_CATEGORIES, dtype=np.int64)
    for _, obj in objs:
        cat_idx = obj.category
        if not isinstance(cat_idx, int):
            cat_idx = category_to_index[cat_idx]
        counts[cat_idx] += 1

    # Clamp so a class with no samples cannot cause a division by zero.
    counts = np.maximum(counts, 1)

    total = float(np.sum(counts))
    num_classes = len(counts)
    class_weights = {}
    for idx, class_count in enumerate(counts):
        class_weights[idx] = total / (num_classes * class_count)
    return class_weights, counts

# Weights come from the training split only; the generator reads the
# module-level class_weights to build per-sample weights.
class_weights, class_counts = compute_class_weights(objs_train)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

# -----------------------------
# Generator
# -----------------------------
# Number of samples per yielded batch.
BATCH_SIZE = 16

def generator_images(objs, batch_size=BATCH_SIZE, augment=False):
    """Yield ``(features, labels, sample_weights)`` batches forever.

    The sample order is reshuffled at the start of every pass, for training and
    validation alike. One full pass produces ``ceil(len(objs) / batch_size)``
    batches; the last batch of a pass may be smaller than ``batch_size``.

    Args:
        objs: Sized sequence of ``(filename, object)`` pairs whose
            ``object.category`` is already an integer class index.
        batch_size: Maximum number of samples per batch.
        augment: Forwarded to ``load_image_features``.

    Yields:
        Three float32 tensors: stacked feature vectors, one-hot labels and the
        per-sample weights taken from the module-level ``class_weights``.

    Raises:
        ValueError: On first iteration if ``objs`` is empty. Without this check
            the endless loop would spin forever without yielding anything.
    """
    if len(objs) == 0:
        raise ValueError("generator_images needs at least one (filename, object) pair")

    # Shuffle a private copy so the caller's list is never reordered. The copy
    # is shuffled cumulatively, so the batch order matches shuffling in place.
    order = list(objs)
    while True:
        np.random.shuffle(order)
        for i in range(0, len(order), batch_size):
            group = order[i:i+batch_size]
            features, labels, sample_weights = [], [], []
            for filename, obj in group:
                feat = load_image_features(filename, augment=augment)
                features.append(feat)

                # One-hot label for this sample's class.
                prob = np.zeros(NUM_CATEGORIES, dtype=np.float32)
                prob[obj.category] = 1.0
                labels.append(prob)
                sample_weights.append(float(class_weights[obj.category]))

            X = tf.convert_to_tensor(np.stack(features), dtype=tf.float32)
            y = tf.convert_to_tensor(np.stack(labels), dtype=tf.float32)
            sw = tf.convert_to_tensor(np.stack(sample_weights), dtype=tf.float32)
            yield X, y, sw

# Two endless generators; only the training one applies brightness/contrast augmentation.
train_generator = generator_images(objs_train, augment=True)
valid_generator = generator_images(objs_valid, augment=False)

# -----------------------------
# FFNN Model
# -----------------------------
# 32*32 grayscale pixels + 8 histogram bins: must match the length of the
# vector returned by load_image_features with its default target_size.
input_dim = 32*32 + 8
# Four hidden blocks (1024 -> 512 -> 256 -> 128) of Dense -> BatchNorm -> LeakyReLU -> Dropout,
# followed by a softmax over the classes.
model = Sequential([
    Input(shape=(input_dim,)),
    Dense(1024, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.4),
    
    Dense(512, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.3),
    
    Dense(256, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.2),
    
    Dense(128, kernel_initializer=HeNormal()),
    BatchNormalization(),
    LeakyReLU(0.1),
    Dropout(0.2),
    
    Dense(NUM_CATEGORIES, activation='softmax')
])
model.summary()

# -----------------------------
# Compile
# -----------------------------
# categorical_crossentropy matches the one-hot labels produced by the generator.
opt = Adam(learning_rate=1e-3)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# -----------------------------
# Callbacks
# -----------------------------
# Every callback except TerminateOnNaN watches val_accuracy; the checkpoint
# keeps only the best epoch in model.keras.
callbacks = [
    ModelCheckpoint('model.keras', monitor='val_accuracy', save_best_only=True, verbose=1),
    ReduceLROnPlateau('val_accuracy', factor=0.1, patience=10, verbose=1),
    EarlyStopping('val_accuracy', patience=20, verbose=1),
    TerminateOnNaN()
]

# -----------------------------
# Training
# -----------------------------
EPOCHS = 30
# ceil() so the final, possibly smaller, batch is counted: one epoch is one full pass.
train_steps = math.ceil(len(objs_train)/BATCH_SIZE)
valid_steps = math.ceil(len(objs_valid)/BATCH_SIZE)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps,
    validation_data=valid_generator,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# Report the epoch with the highest validation accuracy ([0] is the fallback if the metric is missing).
best_idx = int(np.argmax(history.history.get('val_accuracy', [0])))
best_value = np.max(history.history.get('val_accuracy', [0]))
print(f'Best validation model: epoch {best_idx+1} - val_accuracy {best_value:.4f}')

Training images: 16871 Validation images: 1875
Class counts: [ 592 2973 1557 1981  963  632 1118  726 3248  103 1320 1386  272]
Class weights: {0: 2.1921777546777546, 1: 0.4365184092732024, 2: 0.8335062496912208, 3: 0.6551081427406515, 4: 1.3476315999680486, 5: 2.0534323271665045, 6: 1.1607953763588825, 7: 1.7875609239245602, 8: 0.399559492231906, 9: 12.599701269604182, 10: 0.9831585081585081, 11: 0.9363414363414363, 12: 4.771210407239819}


Epoch 1/30
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.1542 - loss: 2.4340
Epoch 1: val_accuracy improved from -inf to 0.26667, saving model to model.keras
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 109ms/step - accuracy: 0.1542 - loss: 2.4338 - val_accuracy: 0.2667 - val_loss: 1.8923 - learning_rate: 0.0010
Epoch 2/30
[1m1054/1055[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 71ms/step - accuracy: 0.2537 - loss: 2.0320
Epoch 2: val_accuracy improved from 0.26667 to 0.32693, saving model to model.keras
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 77ms/step - accuracy: 0.2537 - loss: 2.0320 - val_accuracy: 0.3269 - val_loss: 1.8592 - learning_rate: 0.0010
Epoch 3/30
[1m1054/1055[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 70ms/step - accuracy: 0.2741 - loss: 1.9517
Epoch 3: val_accuracy improved from 0.32693 to 0.34560, saving model to model.keras
[1m1055/1055[0m [32

Ok now we increased accuracy to 0.44.
Now we introduce the following changes to further boost accuracy:
- Context padding: crops now include surroundings.
- Bounding box features: (x1, y1, x2, y2) normalized appended to features.
- Gaussian noise: injected in preprocessing and in the model (for stability).
- Model: first layer expanded to 1280 neurons, added GaussianNoise layer.
- This should give you a more robust, context-aware model - likely better than 0.44 accuracy.

In [None]:
import os
import uuid
import math
import json
import numpy as np
from PIL import Image, ImageEnhance
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, LeakyReLU, GaussianNoise
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TerminateOnNaN

# -----------------------------
# SETUP
# -----------------------------
# Dataset root: image filenames and the annotation JSON are joined onto this path.
IMAGES_PATH = '/kaggle/input/xview-recognition'
# One seed shared by NumPy's and TensorFlow's global random generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# -----------------------------
# Categories
# -----------------------------
# Class id -> label. Ids 0..12 are zipped with the labels in listed order.
categories = dict(zip(range(13), (
    'Cargo plane', 'Small car', 'Bus', 'Truck',
    'Motorboat', 'Fishing vessel', 'Dump truck', 'Excavator',
    'Building', 'Helipad', 'Storage tank', 'Shipping container',
    'Pylon',
)))
NUM_CATEGORIES = len(categories)
# Reverse lookup: label -> class id.
category_to_index = {label: idx for idx, label in categories.items()}

# -----------------------------
# Data classes
# -----------------------------
class GenericObject:
    """Annotation record for a single object.

    Holds a freshly generated UUID plus -1 placeholders for the bounding box,
    category and score.
    """

    def __init__(self):
        self.id = uuid.uuid4()
        self.bb = (-1,) * 4
        self.category = self.score = -1

class GenericImage:
    """Container tying an image file name to its annotated objects."""

    def __init__(self, filename):
        self.filename = filename
        # Placeholder tile: four -1 values until a real tile is assigned.
        self.tile = np.array((-1, -1, -1, -1))
        self.objects = list()

    def add_object(self, obj: GenericObject):
        """Register ``obj`` as belonging to this image."""
        self.objects.append(obj)

# -----------------------------
# Load image
# -----------------------------
def load_geoimage(filename):
    """Read every band of a raster file into one float32 array.

    Args:
        filename: Path of the image relative to ``IMAGES_PATH``.

    Returns:
        ``float32`` array of shape ``(height, width, band_count)``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    # Imported locally: this cell's import list contains neither module, so
    # calling the function would otherwise fail with a NameError.
    import warnings
    import rasterio
    import rasterio.errors

    warnings.filterwarnings('ignore', category=rasterio.errors.NotGeoreferencedWarning)
    full_path = os.path.join(IMAGES_PATH, filename)
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Image not found: {full_path}")

    # The context manager closes the dataset once all bands have been copied.
    with rasterio.open(full_path, 'r') as src_raster:
        img = np.zeros((src_raster.height, src_raster.width, src_raster.count), dtype=np.float32)
        for band in range(src_raster.count):
            # rasterio band indices are 1-based.
            img[:, :, band] = src_raster.read(band + 1)
    return img

# -----------------------------
# Load images with bbox + context
# -----------------------------
def _context_crop_box(bb, image_size, context_factor):
    """Scale ``bb`` about its centre by ``context_factor`` and clamp it to the image.

    The returned ``(left, top, right, bottom)`` box always covers at least one
    pixel, so a zero-area or out-of-image ``bb`` cannot produce an empty crop.
    """
    w, h = image_size
    x1, y1, x2, y2 = bb
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
    half_w = (x2 - x1) * context_factor / 2
    half_h = (y2 - y1) * context_factor / 2

    # Clamp the near edge inside the image, then force the far edge to lie at
    # least one pixel beyond it. For well-formed boxes this is a no-op.
    left = min(max(0, int(cx - half_w)), w - 1)
    top = min(max(0, int(cy - half_h)), h - 1)
    right = max(left + 1, min(w, int(cx + half_w)))
    bottom = max(top + 1, min(h, int(cy + half_h)))
    return left, top, right, bottom


def load_image_features(filename, obj, target_size=(32,32), augment=False, context_factor=1.5):
    """Build the flat feature vector for one annotated object.

    The vector is the concatenation of:
      * the grayscale crop around the object, resized to ``target_size``,
        scaled to [0, 1] and flattened;
      * an 8-bin histogram of those pixel values, normalised to sum to 1;
      * the original bbox corners divided by the image width/height.

    Args:
        filename: Image path relative to ``IMAGES_PATH``.
        obj: Object whose ``bb`` attribute is an ``(x1, y1, x2, y2)`` box.
        target_size: Size passed to ``Image.resize`` for the crop.
        augment: When true, apply random brightness/contrast jitter to the
            crop and add Gaussian noise to the pixel and bbox features.
        context_factor: Multiplier applied to the bbox width/height before
            cropping, so surrounding context is included.

    Returns:
        1-D ``float32`` array of length
        ``target_size[0] * target_size[1] + 8 + 4``.
    """
    full_path = os.path.join(IMAGES_PATH, filename)
    # Close the file as soon as the pixels have been converted into memory.
    with Image.open(full_path) as source:
        img = source.convert("RGB")
    w, h = img.size

    x1, y1, x2, y2 = obj.bb
    crop = img.crop(_context_crop_box(obj.bb, (w, h), context_factor))

    if augment:
        # Local import: keeps the function usable when the enclosing cell only
        # imported `Image` from PIL.
        from PIL import ImageEnhance

        crop = ImageEnhance.Brightness(crop).enhance(np.random.uniform(0.8, 1.2))
        crop = ImageEnhance.Contrast(crop).enhance(np.random.uniform(0.8, 1.2))

    crop = crop.convert("L")
    crop = crop.resize(target_size)
    img_array = np.array(crop).astype(np.float32) / 255.0
    img_array = img_array.flatten()

    # Histogram is taken before noise injection, so all values lie in [0, 1].
    hist = np.histogram(img_array, bins=8, range=(0,1))[0].astype(np.float32)
    # Counts are integers, so the max() only matters for an empty histogram.
    hist /= max(float(np.sum(hist)), 1.0)

    # Normalized bbox coords (of the original, un-expanded box)
    norm_bb = np.array([x1/w, y1/h, x2/w, y2/h], dtype=np.float32)

    # Gaussian noise injection (only in training)
    if augment:
        img_array += np.random.normal(0, 0.01, img_array.shape)
        norm_bb += np.random.normal(0, 0.01, norm_bb.shape)

    features = np.concatenate([img_array, hist, norm_bb])
    return features

# -----------------------------
# Load annotations
# -----------------------------
json_file = os.path.join(IMAGES_PATH, 'xview_ann_train.json')
with open(json_file) as ifs:
    json_data = json.load(ifs)

# The 'images' and 'annotations' tables are walked in lockstep: the i-th
# annotation is attached to the i-th image, one object per image.
image_records = json_data['images'].values()
annotation_records = json_data['annotations'].values()

anns_dataset = []
for json_img, json_ann in zip(image_records, annotation_records):
    image = GenericImage(json_img['filename'])

    obj = GenericObject()
    obj.bb = tuple(int(coord) for coord in json_ann['bbox'])
    obj.category = json_ann['category_id']
    # Category ids given as names are translated to their integer index.
    if isinstance(obj.category, str):
        obj.category = category_to_index[obj.category]

    image.add_object(obj)
    anns_dataset.append(image)

# -----------------------------
# Split dataset
# -----------------------------
# Hold out 10% of the annotated images for validation (seeded shuffle).
anns_train, anns_valid = train_test_split(
    anns_dataset,
    test_size=0.1,
    shuffle=True,
    random_state=RANDOM_SEED,
)
print('Training images:', len(anns_train), 'Validation images:', len(anns_valid))

# Flatten each split into (filename, object) pairs, one entry per object.
objs_train, objs_valid = (
    [(ann.filename, obj) for ann in split for obj in ann.objects]
    for split in (anns_train, anns_valid)
)

# -----------------------------
# Compute class weights
# -----------------------------
def compute_class_weights(objs):
    """Derive inverse-frequency class weights from ``(filename, obj)`` pairs.

    Returns:
        ``(class_weights, counts)`` where ``counts`` is the per-class sample
        count (floored at 1 so empty classes do not divide by zero) and
        ``class_weights[i] == total / (num_classes * counts[i])``.
    """
    tally = np.zeros(NUM_CATEGORIES, dtype=np.int64)
    for _, item in objs:
        label = item.category
        # Labels given by name are translated to their integer index.
        if not isinstance(label, int):
            label = category_to_index[label]
        tally[label] += 1

    tally = np.maximum(tally, 1)
    total = float(np.sum(tally))
    num_classes = len(tally)

    weights = {}
    for index in range(num_classes):
        weights[index] = total / (num_classes * tally[index])
    return weights, tally

class_weights, class_counts = compute_class_weights(objs_train)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

# -----------------------------
# Generator
# -----------------------------
BATCH_SIZE = 16

def generator_images(objs, batch_size=BATCH_SIZE, augment=False):
    """Yield ``(features, one_hot_labels, sample_weights)`` batches forever.

    Each pass visits every entry of ``objs`` once, in a fresh random order.
    The last batch of a pass may be smaller than ``batch_size``.

    Args:
        objs: Sequence of ``(filename, obj)`` pairs. It is only indexed, never
            reordered, so the caller's list keeps its order.
        batch_size: Maximum number of samples per batch; must be positive.
        augment: Forwarded to ``load_image_features``.

    Yields:
        Three ``tf.float32`` tensors: features ``(n, feature_dim)``, one-hot
        labels ``(n, NUM_CATEGORIES)`` and per-sample class weights ``(n,)``.

    Raises:
        ValueError: On the first ``next()`` if ``objs`` is empty or
            ``batch_size`` is not positive. Without this check the endless
            loop below would spin without ever yielding.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    num_samples = len(objs)
    if num_samples == 0:
        raise ValueError("generator_images needs at least one sample")

    while True:
        # Shuffle indices rather than the list itself, so the caller's
        # sequence is left untouched.
        order = np.random.permutation(num_samples)
        for start in range(0, num_samples, batch_size):
            features, labels, sample_weights = [], [], []
            for sample_idx in order[start:start + batch_size]:
                filename, obj = objs[sample_idx]
                features.append(load_image_features(filename, obj, augment=augment))

                prob = np.zeros(NUM_CATEGORIES, dtype=np.float32)
                prob[obj.category] = 1.0
                labels.append(prob)
                sample_weights.append(float(class_weights[obj.category]))

            X = tf.convert_to_tensor(np.stack(features), dtype=tf.float32)
            y = tf.convert_to_tensor(np.stack(labels), dtype=tf.float32)
            sw = tf.convert_to_tensor(np.asarray(sample_weights), dtype=tf.float32)
            yield X, y, sw

train_generator = generator_images(objs_train, augment=True)
valid_generator = generator_images(objs_valid, augment=False)

# -----------------------------
# FFNN Model (with GaussianNoise layer)
# -----------------------------
# 32x32 grayscale pixels + 8 histogram bins + 4 bbox coordinates.
input_dim = 32*32 + 8 + 4

# (units, dropout rate) for each hidden stage, from widest to narrowest.
hidden_stages = [(1280, 0.4), (512, 0.3), (256, 0.2), (128, 0.2)]

model_layers = [
    Input(shape=(input_dim,)),
    GaussianNoise(0.01),  # noise robustness
]
for stage_units, stage_dropout in hidden_stages:
    # Every stage is Dense -> BatchNorm -> LeakyReLU -> Dropout.
    model_layers.extend([
        Dense(stage_units, kernel_initializer=HeNormal()),
        BatchNormalization(),
        LeakyReLU(0.1),
        Dropout(stage_dropout),
    ])
model_layers.append(Dense(NUM_CATEGORIES, activation='softmax'))

model = Sequential(model_layers)
model.summary()

# -----------------------------
# Compile
# -----------------------------
opt = Adam(learning_rate=1e-3)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# -----------------------------
# Callbacks
# -----------------------------
# Checkpointing, LR scheduling and early stopping all watch the same metric.
MONITORED_METRIC = 'val_accuracy'
callbacks = [
    ModelCheckpoint('model.keras', monitor=MONITORED_METRIC, save_best_only=True, verbose=1),
    ReduceLROnPlateau(monitor=MONITORED_METRIC, factor=0.1, patience=10, verbose=1),
    EarlyStopping(monitor=MONITORED_METRIC, patience=20, verbose=1),
    TerminateOnNaN(),
]

# -----------------------------
# Training
# -----------------------------
EPOCHS = 30
# One epoch = one full pass over the split, counting the final partial batch.
train_steps, valid_steps = (
    math.ceil(len(split) / BATCH_SIZE) for split in (objs_train, objs_valid)
)

history = model.fit(
    train_generator,
    validation_data=valid_generator,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

# Report the epoch with the highest validation accuracy ([0] if none recorded).
val_accuracy_curve = history.history.get('val_accuracy', [0])
best_idx = int(np.argmax(val_accuracy_curve))
best_value = np.max(val_accuracy_curve)
print(f'Best validation model: epoch {best_idx+1} - val_accuracy {best_value:.4f}')


2025-10-05 11:51:41.883407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759665102.268296      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759665102.376204      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Training images: 16871 Validation images: 1875
Class counts: [ 592 2973 1557 1981  963  632 1118  726 3248  103 1320 1386  272]
Class weights: {0: 2.1921777546777546, 1: 0.4365184092732024, 2: 0.8335062496912208, 3: 0.6551081427406515, 4: 1.3476315999680486, 5: 2.0534323271665045, 6: 1.1607953763588825, 7: 1.7875609239245602, 8: 0.399559492231906, 9: 12.599701269604182, 10: 0.9831585081585081, 11: 0.9363414363414363, 12: 4.771210407239819}


I0000 00:00:1759665121.144927      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1759665121.145650      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [None]:
def draw_confusion_matrix(cm, categories):
    """Plot ``cm`` as a row-normalised heat map labelled with category names.

    Args:
        cm: Square confusion matrix (rows = annotations, columns = predictions).
        categories: Mapping whose values, in order, label the axes.
    """
    names = list(categories.values())
    # Figure and font sizes scale with the square root of the class count.
    scale = pow(len(categories), 0.5)

    fig = plt.figure(figsize=[6.4*scale, 4.8*scale])
    ax = fig.add_subplot(111)

    # Normalise each row to fractions; eps keeps empty rows from dividing by 0.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm = cm.astype('float') / np.maximum(row_totals, np.finfo(np.float64).eps)

    im = ax.imshow(cm, interpolation='nearest', cmap=plt.colormaps['Blues'])
    ax.figure.colorbar(im, ax=ax)
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        xticklabels=names,
        yticklabels=names,
        ylabel='Annotation',
        xlabel='Prediction',
    )
    # Rotate the tick labels and set their alignment
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write each cell's value, in white on dark cells and black on light ones.
    thresh = cm.max() / 2.0
    cell_fontsize = int(20-scale)
    for (i, j), value in np.ndenumerate(cm):
        text_color = "white" if value > thresh else "black"
        ax.text(j, i, format(value, '.2f'), ha="center", va="center", color=text_color, fontsize=cell_fontsize)

    fig.tight_layout()
    plt.show()

model.load_weights('model.keras')

# The network was trained on the flat vectors built by load_image_features,
# so evaluation must feed it the same features (no augmentation), not the
# raw raster. Ground truth and predictions are both stored as category
# NAMES so they line up with `labels=list(categories.values())` in the
# confusion matrix.
category_names = list(categories.values())
y_true, eval_features = [], []
for ann in anns_valid:
    for obj_pred in ann.objects:
        eval_features.append(load_image_features(ann.filename, obj_pred, augment=False))
        true_category = obj_pred.category
        # Annotations hold the integer class index; convert it to its name.
        if not isinstance(true_category, str):
            true_category = categories[int(true_category)]
        y_true.append(true_category)

y_pred = []
if eval_features:
    # One batched predict call instead of one call per object.
    predictions = model.predict(np.stack(eval_features), batch_size=256, verbose=0)
    y_pred = [category_names[best] for best in np.argmax(predictions, axis=1)]

# Confusion matrix over category names (rows = annotation, cols = prediction)
cm = confusion_matrix(y_true, y_pred, labels=list(categories.values()))
draw_confusion_matrix(cm, categories)

# Lower bound used to keep every ratio below free of division by zero.
eps = np.finfo(np.float64).eps

# Global figures
correct_samples_class = np.diag(cm).astype(float)
total_samples_class = cm.sum(axis=1).astype(float)
total_predicts_class = cm.sum(axis=0).astype(float)
print('Mean Accuracy: %.3f%%' % (np.sum(correct_samples_class) / np.sum(total_samples_class) * 100))
acc = correct_samples_class / np.maximum(total_samples_class, eps)
print('Mean Recall: %.3f%%' % (acc.mean() * 100))
acc = correct_samples_class / np.maximum(total_predicts_class, eps)
print('Mean Precision: %.3f%%' % (acc.mean() * 100))

# Per-class figures
total_samples = cm.sum()
for idx, class_name in enumerate(categories.values()):
    # TP/FP: predicted positives that were correct/incorrect.
    # TN/FN: predicted negatives that were correct/incorrect.
    tp = cm[idx, idx]
    fp = cm[:, idx].sum() - tp
    fn = cm[idx, :].sum() - tp
    # Everything that is neither in row idx nor in column idx.
    tn = total_samples - tp - fp - fn
    # Recall (true positive rate): share of real positives predicted positive.
    recall = tp / np.maximum(tp+fn, eps)
    # Precision: share of predicted positives that are real positives.
    precision = tp / np.maximum(tp+fp, eps)
    # Specificity (true negative rate): share of real negatives predicted negative.
    specificity = tn / np.maximum(tn+fp, eps)
    # Dice = 2 |A∩B| / (|A|+|B|) = 2 TP / (2 TP + FP + FN)
    f1_score = 2 * ((precision * recall) / np.maximum(precision+recall, eps))
    print('> %s: Recall: %.3f%% Precision: %.3f%% Specificity: %.3f%% Dice: %.3f%%' % (class_name, recall*100, precision*100, specificity*100, f1_score*100))