In [1]:
import numpy as np, pandas as pd   

In [2]:
test_dir='/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/test/*/*.jpg'
train_dir='/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/train/*/'
train_labels=pd.read_csv(r'/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/train_labels.csv')
sample=pd.read_csv(r'/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/sample_submission.csv')

In [3]:
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737 entries, 0 to 736
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   row_id                737 non-null    int64  
 1   tomo_id               737 non-null    object 
 2   Motor axis 0          737 non-null    float64
 3   Motor axis 1          737 non-null    float64
 4   Motor axis 2          737 non-null    float64
 5   Array shape (axis 0)  737 non-null    int64  
 6   Array shape (axis 1)  737 non-null    int64  
 7   Array shape (axis 2)  737 non-null    int64  
 8   Voxel spacing         737 non-null    float64
 9   Number of motors      737 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 57.7+ KB


In [4]:
train_labels.head()

Unnamed: 0,row_id,tomo_id,Motor axis 0,Motor axis 1,Motor axis 2,Array shape (axis 0),Array shape (axis 1),Array shape (axis 2),Voxel spacing,Number of motors
0,0,tomo_003acc,-1.0,-1.0,-1.0,500,1912,1847,6.5,0
1,1,tomo_00e047,169.0,546.0,603.0,300,959,928,15.6,1
2,2,tomo_00e463,235.0,403.0,137.0,500,924,956,19.7,6
3,3,tomo_00e463,243.0,363.0,153.0,500,924,956,19.7,6
4,4,tomo_00e463,222.0,379.0,144.0,500,924,956,19.7,6


In [5]:
# Imports
import os
import glob
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Configuration
IMG_SIZE = (64, 64)
DATA_DIR = '/kaggle/input/byu-locating-bacterial-flagellar-motors-2025'
TRAIN_DIR_PATTERN = os.path.join(DATA_DIR, 'train/*/')
TEST_SLICE_PATTERN = os.path.join(DATA_DIR, 'test/*/*.jpg')
LABEL_PATH = os.path.join(DATA_DIR, 'train_labels.csv')

# Load training labels
train_labels = pd.read_csv(LABEL_PATH)

# Load one slice per training tomogram
def load_motor_slices(train_dirs, labels_df):
    images, coords = [], []
    for tomo_path in glob.glob(train_dirs):
        tomo_id = os.path.basename(os.path.normpath(tomo_path))
        label_row = labels_df[labels_df['tomo_id'] == tomo_id]
        if label_row.empty:
            continue
        label = label_row.iloc[0]
        z = int(round(label['Motor axis 0']))
        x = label['Motor axis 2']
        y = label['Motor axis 1']
        slice_paths = sorted(glob.glob(os.path.join(tomo_path, '*.jpg')))
        if len(slice_paths) == 0 or z >= len(slice_paths):
            continue
        img = cv2.imread(slice_paths[z], cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, IMG_SIZE)
        img = img / 255.0
        images.append(img)
        coords.append([x / IMG_SIZE[1], y / IMG_SIZE[0]])
    return np.expand_dims(np.array(images), -1), np.array(coords)

# Load and split data
X, y = load_motor_slices(TRAIN_DIR_PATTERN, train_labels)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Lightweight 2D CNN
def build_fast_model(input_shape=(64, 64, 1)):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Conv2D(16, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(2, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Train model
model = build_fast_model()
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, verbose=2)

# Save model
model.save('/kaggle/working/motor_locator_2dcnn.h5')

# Efficient test loader and predictor
def load_and_preprocess_image(path):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, IMG_SIZE)
    return img / 255.0

def group_test_by_tomo_memory_efficient(test_pattern):
    grouped = defaultdict(list)
    for path in sorted(glob.glob(test_pattern)):
        tomo_id = os.path.basename(os.path.dirname(path))
        grouped[tomo_id].append(path)
    return grouped

def prepare_submission_memory_efficient(model, test_pattern, threshold=120):
    grouped = group_test_by_tomo_memory_efficient(test_pattern)
    submission = []

    for tomo_id, slice_paths in grouped.items():
        preds = []
        for path in slice_paths:
            img = load_and_preprocess_image(path)
            img = np.expand_dims(img, axis=(0, -1))  # Shape: (1, 64, 64, 1)
            pred = model.predict(img, verbose=0)[0]
            pred_scaled = pred * np.array([IMG_SIZE[1], IMG_SIZE[0]])
            preds.append(pred_scaled)

        preds = np.array(preds)
        norms = np.linalg.norm(preds, axis=1)
        best_idx = np.argmin(norms)

        if norms[best_idx] > threshold:
            submission.append([tomo_id, -1, -1, -1])
        else:
            y, x = preds[best_idx]
            submission.append([tomo_id, best_idx, y, x])

    return submission

# Generate and verify submission
submission = prepare_submission_memory_efficient(model, TEST_SLICE_PATTERN)
submission_df = pd.DataFrame(submission, columns=['tomo_id', 'Motor axis 0', 'Motor axis 1', 'Motor axis 2'])

# Validation and cleanup
test_paths = glob.glob(TEST_SLICE_PATTERN)
expected_tomo_ids = set([os.path.basename(os.path.dirname(p)) for p in test_paths])
submission_df = submission_df[submission_df['tomo_id'].isin(expected_tomo_ids)].drop_duplicates(subset=['tomo_id'])

# Clip invalid slice indices
grouped_test = defaultdict(list)
for p in test_paths:
    tomo = os.path.basename(os.path.dirname(p))
    grouped_test[tomo].append(p)

def clip_z(row):
    if row['Motor axis 0'] == -1:
        return -1
    max_idx = len(grouped_test[row['tomo_id']]) - 1
    return int(min(max(row['Motor axis 0'], 0), max_idx))

submission_df['Motor axis 0'] = submission_df.apply(clip_z, axis=1)
submission_df[['Motor axis 1', 'Motor axis 2']] = submission_df[['Motor axis 1', 'Motor axis 2']].round(2)

# Save final cleaned submission
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
print("\n✅ Cleaned submission saved. Preview:")
print(submission_df.head())


Epoch 1/20
17/17 - 2s - 135ms/step - loss: 33.6505 - mae: 4.1845 - val_loss: 32.6236 - val_mae: 4.2051
Epoch 2/20
17/17 - 1s - 35ms/step - loss: 32.9150 - mae: 4.1734 - val_loss: 31.4119 - val_mae: 4.1750
Epoch 3/20
17/17 - 1s - 35ms/step - loss: 31.3810 - mae: 4.1571 - val_loss: 29.8427 - val_mae: 4.1282
Epoch 4/20
17/17 - 1s - 35ms/step - loss: 30.4558 - mae: 4.1404 - val_loss: 29.5605 - val_mae: 4.1176
Epoch 5/20
17/17 - 1s - 35ms/step - loss: 30.3748 - mae: 4.1403 - val_loss: 29.5376 - val_mae: 4.1166
Epoch 6/20
17/17 - 1s - 34ms/step - loss: 30.3657 - mae: 4.1402 - val_loss: 29.5337 - val_mae: 4.1164
Epoch 7/20
17/17 - 1s - 35ms/step - loss: 30.3637 - mae: 4.1402 - val_loss: 29.5322 - val_mae: 4.1164
Epoch 8/20
17/17 - 1s - 35ms/step - loss: 30.3627 - mae: 4.1402 - val_loss: 29.5312 - val_mae: 4.1163
Epoch 9/20
17/17 - 1s - 37ms/step - loss: 30.3621 - mae: 4.1402 - val_loss: 29.5306 - val_mae: 4.1163
Epoch 10/20
17/17 - 1s - 35ms/step - loss: 30.3616 - mae: 4.1401 - val_loss: 29.5