In [None]:
!pip install -q autogluon
!pip install -q scikit-learn==1.3.2 


In [None]:
# Report which scikit-learn build the kernel actually imported.
import sklearn

print('scikit-learn version:', sklearn.__version__, sep=' ')

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from collections import defaultdict
from autogluon.multimodal import MultiModalPredictor

# Configuration
IMG_SIZE = (64, 64)  # target size handed to cv2.resize for every slice
DATA_DIR = '/kaggle/input/byu-locating-bacterial-flagellar-motors-2025'

# Glob patterns and file locations, all resolved relative to DATA_DIR.
TRAIN_DIR_PATTERN, TEST_SLICE_PATTERN, LABEL_PATH = (
    os.path.join(DATA_DIR, relative_part)
    for relative_part in ('train/*/', 'test/*/*.jpg', 'train_labels.csv')
)

# Load training labels
train_labels = pd.read_csv(LABEL_PATH)

# Load one slice per training tomogram
def load_motor_slices(train_dirs, labels_df):
    """Export the labelled slice of each training tomogram as a resized JPEG.

    For every directory matching ``train_dirs`` the first row of ``labels_df``
    with the same ``tomo_id`` is used. The slice at index
    ``round(row['Motor axis 0'])`` (slices sorted by file name) is read as
    grayscale, resized to ``IMG_SIZE`` and written to
    ``/kaggle/working/temp_train/<tomo_id>_slice.jpg``. The output directory is
    expected to exist already.

    A tomogram is skipped when it has no label row, when its slice index is
    missing (NaN) or outside ``[0, number of slices)``, or when the selected
    slice cannot be decoded. Negative indices are rejected explicitly: without
    the lower bound, Python's negative indexing would silently pick a slice
    counted from the end of the stack.

    Args:
        train_dirs: Glob pattern matching one directory per tomogram.
        labels_df: DataFrame with columns ``tomo_id``, ``Motor axis 0``,
            ``Motor axis 1`` and ``Motor axis 2``.

    Returns:
        Tuple ``(images, coords, ids)`` of equally long lists: the written image
        paths, the targets as ``[x / IMG_SIZE[1], y / IMG_SIZE[0]]`` (x taken from
        ``Motor axis 2``, y from ``Motor axis 1``), and the tomogram ids.

    Raises:
        OSError: If a resized slice cannot be written.
    """
    images, coords, ids = [], [], []
    for tomo_path in glob.glob(train_dirs):
        tomo_id = os.path.basename(os.path.normpath(tomo_path))
        label_row = labels_df[labels_df['tomo_id'] == tomo_id]
        if label_row.empty:
            continue
        label = label_row.iloc[0]
        if pd.isna(label['Motor axis 0']):
            # No usable slice index; int(round(nan)) would raise.
            continue
        z = int(round(label['Motor axis 0']))
        x = label['Motor axis 2']
        y = label['Motor axis 1']
        slice_paths = sorted(glob.glob(os.path.join(tomo_path, '*.jpg')))
        # Also covers the "no slices" case (len == 0) and negative indices.
        if not 0 <= z < len(slice_paths):
            continue
        img = cv2.imread(slice_paths[z], cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            continue
        img = cv2.resize(img, IMG_SIZE)
        img_path = f'/kaggle/working/temp_train/{tomo_id}_slice.jpg'
        if not cv2.imwrite(img_path, img):
            raise OSError(f'Could not write resized slice to {img_path}')
        images.append(img_path)
        # NOTE(review): the label coordinates are divided by the resized IMG_SIZE,
        # not by the original slice size. Inference multiplies by the same
        # factors, so the scaling is kept as-is here; confirm this is intended.
        coords.append([x / IMG_SIZE[1], y / IMG_SIZE[0]])
        ids.append(tomo_id)
    return images, coords, ids

# Create temporary directory for processed training images
os.makedirs('/kaggle/working/temp_train', exist_ok=True)

# Load and split data
X_paths, y_coords, ids = load_motor_slices(TRAIN_DIR_PATTERN, train_labels)

# Each entry of y_coords is [x, y], where x comes from the label column
# 'Motor axis 2' and y from 'Motor axis 1'. Index 1 therefore feeds the
# 'Motor axis 1' target and index 0 the 'Motor axis 2' target; the reverse
# assignment would train (and later submit) the two axes swapped.
train_df = pd.DataFrame({
    'image': X_paths,
    'Motor axis 1': [coord[1] for coord in y_coords],
    'Motor axis 2': [coord[0] for coord in y_coords],
})

train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Train with AutoGluon
# NOTE(review): confirm that the installed AutoGluon version accepts a list of
# label columns and "resnet" as a "model.names" entry.
predictor = MultiModalPredictor(label=['Motor axis 1', 'Motor axis 2'], problem_type='regression')
# The held-out split is passed as tuning data so it is actually used for validation.
predictor.fit(
    train_data,
    tuning_data=val_data,
    time_limit=1800,
    hyperparameters={"model.names": ["resnet"]},
)

# Save predictor
predictor.save('/kaggle/working/motor_locator_autogluon')

# Efficient test loader
def load_and_preprocess_image_to_tmp(path):
    """Write a grayscale, ``IMG_SIZE``-resized copy of ``path`` and return its location.

    The copy is stored under ``/kaggle/working/temp_test/`` (created on demand).
    Its file name is prefixed with the name of the directory containing ``path``
    so that equally named slices from different directories do not overwrite
    each other.

    Args:
        path: Path of the source image.

    Returns:
        Path of the resized copy.

    Raises:
        ValueError: If the source image cannot be read.
        OSError: If the resized copy cannot be written.
    """
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None instead of raising for unreadable files.
        raise ValueError(f'Could not read image: {path}')
    img = cv2.resize(img, IMG_SIZE)

    tmp_dir = '/kaggle/working/temp_test/'
    os.makedirs(tmp_dir, exist_ok=True)
    parent_name = os.path.basename(os.path.dirname(path))
    file_name = os.path.basename(path)
    if parent_name:
        file_name = f'{parent_name}_{file_name}'
    tmp_path = tmp_dir + file_name
    if not cv2.imwrite(tmp_path, img):
        raise OSError(f'Could not write resized image to {tmp_path}')
    return tmp_path

# Prepare submission
def group_test_by_tomo(test_pattern):
    """Collect the files matching ``test_pattern`` per parent directory.

    Returns a ``defaultdict(list)`` keyed by the name of the directory that
    directly contains each file; the paths are visited in sorted order, so each
    list is sorted as well.
    """
    slices_by_tomo = defaultdict(list)
    matches = glob.glob(test_pattern)
    matches.sort()
    for slice_path in matches:
        parent_dir, _ = os.path.split(slice_path)
        slices_by_tomo[os.path.basename(parent_dir)].append(slice_path)
    return slices_by_tomo

# Directory that load_and_preprocess_image_to_tmp writes the resized test slices into.
os.makedirs('/kaggle/working/temp_test', exist_ok=True)

def prepare_submission_autogluon(predictor, test_pattern, threshold=120):
    """Build one submission row per test tomogram from per-slice predictions.

    Every slice matching ``test_pattern`` is resized via
    ``load_and_preprocess_image_to_tmp`` and passed to ``predictor.predict``
    tomogram by tomogram. The two predicted columns are scaled by
    ``[IMG_SIZE[1], IMG_SIZE[0]]``, and the slice whose scaled prediction has
    the smallest Euclidean norm is selected.

    NOTE(review): selecting the slice by smallest coordinate norm is kept from
    the original logic; confirm it is the intended criterion.

    Args:
        predictor: Object whose ``predict(DataFrame)`` returns two values per row
            (a DataFrame or an array-like).
        test_pattern: Glob pattern for the test slices, forwarded to
            ``group_test_by_tomo``.
        threshold: If the selected slice's norm exceeds this value, the
            tomogram is reported as ``[-1, -1, -1]``.

    Returns:
        List of rows ``[tomo_id, slice_index, first_pred, second_pred]`` holding
        plain Python numbers, or ``[tomo_id, -1, -1, -1]``.
    """
    grouped = group_test_by_tomo(test_pattern)
    submission = []
    # Loop-invariant scale factors for the two predicted columns.
    scale = np.array([IMG_SIZE[1], IMG_SIZE[0]])

    for tomo_id, slice_paths in grouped.items():
        test_df = pd.DataFrame(
            [{'image': load_and_preprocess_image_to_tmp(path)} for path in slice_paths]
        )
        # np.asarray accepts both DataFrame and ndarray outputs of predict().
        preds = np.asarray(predictor.predict(test_df), dtype=float) * scale

        norms = np.linalg.norm(preds, axis=1)
        best_idx = int(np.argmin(norms))

        if norms[best_idx] > threshold:
            submission.append([tomo_id, -1, -1, -1])
        else:
            y, x = (float(value) for value in preds[best_idx])
            submission.append([tomo_id, best_idx, y, x])

    return submission

# Generate submission
submission = prepare_submission_autogluon(predictor, TEST_SLICE_PATTERN)
submission_df = pd.DataFrame(submission, columns=['tomo_id', 'Motor axis 0', 'Motor axis 1', 'Motor axis 2'])

# Validate and cleanup
# Group the test slices once with the shared helper instead of globbing and
# grouping them a second time by hand; clip_z looks up slice counts in grouped_test.
grouped_test = group_test_by_tomo(TEST_SLICE_PATTERN)
test_paths = [path for paths in grouped_test.values() for path in paths]
expected_tomo_ids = set(grouped_test)

# Keep only tomograms that exist in the test set, one row per tomogram.
submission_df = submission_df[submission_df['tomo_id'].isin(expected_tomo_ids)].drop_duplicates(subset=['tomo_id'])

def clip_z(row):
    """Clamp a row's 'Motor axis 0' to the slice range of its tomogram.

    The sentinel ``-1`` is passed through untouched. Any other value is clamped
    to ``[0, number of slices - 1]`` (slice counts come from the module-level
    ``grouped_test``) and truncated to ``int``.
    """
    z = row['Motor axis 0']
    if z == -1:
        return -1
    last_idx = len(grouped_test[row['tomo_id']]) - 1
    floored = max(z, 0)
    capped = min(floored, last_idx)
    return int(capped)

# Clamp slice indices row by row, then round the two in-plane coordinates to 2 decimals.
submission_df['Motor axis 0'] = submission_df.apply(clip_z, axis=1)
xy_columns = ['Motor axis 1', 'Motor axis 2']
submission_df[xy_columns] = submission_df[xy_columns].round(2)

# Save final submission
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
print("\n✅ Cleaned submission saved. Preview:", submission_df.head(), sep="\n")


In [None]:
# Display the final submission frame as the cell output.
submission_df