In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# LOADING DATA & PREPARATION

# Dataset locations on Kaggle; the submission file goes to the working directory.
train_csv_path = "/kaggle/input/UBC-OCEAN/train.csv"
train_images_path = "/kaggle/input/UBC-OCEAN/train_thumbnails"
test_csv_path = "/kaggle/input/UBC-OCEAN/test.csv"
test_images_path = "/kaggle/input/UBC-OCEAN/test_images"
submission_path = "/kaggle/working/submission.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)


# Working on only thumbnails
# Thumbnail files are read back later as "<image_id>_thumbnail.png", so the text
# before the first underscore of each file name is the image id.
existing_images = {img.split('_')[0] for img in os.listdir(train_images_path)}
print( len(existing_images))
# Ids are compared as strings because they are parsed out of file names.
train_df['image_id'] = train_df['image_id'].astype(str)
print(train_df.count())
# Keep only rows that have a thumbnail on disk. The explicit .copy() makes the
# filtered frame independent of the original, so adding the 'encoded_labels'
# column below does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df['image_id'].isin(existing_images)].copy()
print(train_df.count())

# Map the string class labels to integer codes; `encoder` is kept so that
# predictions can be mapped back to label names later.
labels = train_df['label']
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
train_df['encoded_labels'] = encoded_labels

def preprocess_image(image_path, target_size=(128, 128)):
    """Load an image from disk and return it as a resized, normalized RGB array.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    target_size : tuple of int, optional
        Size passed to ``cv2.resize``. Defaults to ``(128, 128)``.

    Returns
    -------
    numpy.ndarray or None
        The image converted from BGR to RGB, resized to ``target_size`` and
        divided by 255.0. ``None`` is returned when the file does not exist
        or when OpenCV cannot decode it, so callers can skip the image.
    """
    if not os.path.exists(image_path):
        return None

    image = cv2.imread(image_path)
    # cv2.imread reports an unreadable or corrupt file by returning None rather
    # than raising; without this guard cv2.cvtColor would fail on the None input.
    if image is None:
        return None

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, target_size)
    image = image / 255.0  # Normalize pixel values to [0, 1]
    print(f"LOG: prepricesing img {image_path}")
    return image

train_df['image_id'] = train_df['image_id'].astype(str)

train_images = []
train_labels = []
# Iterate ids and their encoded labels together. This replaces a per-image
# boolean filter over the whole frame (quadratic in the number of rows) with a
# single pass, and each image gets the label from its own row.
for img_id, label in zip(train_df['image_id'], train_df['encoded_labels']):
    img_path = os.path.join(train_images_path, f"{img_id}_thumbnail.png")
    image = preprocess_image(img_path)
    # preprocess_image returns None for images it could not load; skip those
    # so images and labels stay aligned.
    if image is not None:
        train_images.append(image)
        train_labels.append(label)

train_images = np.array(train_images)
train_labels = np.array(train_labels)

train_images, train_labels = shuffle(train_images, train_labels, random_state=42)

# Hold out 20% of the loaded images for validation.
train_images, val_images, train_labels, val_labels = train_test_split(train_images, train_labels, test_size=0.2, random_state=42)

# The classifier expects one flat feature vector per sample.
train_images_flatten = train_images.reshape(train_images.shape[0], -1)
val_images_flatten = val_images.reshape(val_images.shape[0], -1)

# MODEL

model = RandomForestClassifier(n_estimators=100, random_state=42)

print("FITTING THE MODEL")
model.fit(train_images_flatten, train_labels)
val_predictions = model.predict(val_images_flatten)

# Accuracy
accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {accuracy}")

513
image_id        538
label           538
image_width     538
image_height    538
is_tma          538
dtype: int64
image_id        513
label           513
image_width     513
image_height    513
is_tma          513
dtype: int64
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/4_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/66_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/281_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/286_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/431_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/706_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/970_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/1020_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/train_thumbnails/1080_thumbnail.png
LOG: prepricesing img /kaggle/input/UBC-OC

In [2]:
# ON TEST SET

test_df['image_id'] = test_df['image_id'].astype(str)

# Remember the row position of every image that loads, so predictions can be
# written back to the matching submission row even when some images are skipped.
test_images = []
loaded_positions = []
for position, img_id in enumerate(test_df['image_id']):
    img_path = os.path.join(test_images_path, f"{img_id}.png")
    image = preprocess_image(img_path)
    if image is not None:
        test_images.append(image)
        loaded_positions.append(position)

test_images = np.array(test_images)

# Rows whose image could not be loaded still need a label in the submission;
# they fall back to the most frequent class among the training labels.
fallback_label = encoder.inverse_transform([np.bincount(train_labels).argmax()])[0]
predicted_labels = np.full(len(test_df), fallback_label, dtype=object)
test_predictions = np.array([], dtype=int)

# Only predict when at least one image loaded: reshape(0, -1) on an empty
# array raises, and there would be nothing to predict anyway.
if len(loaded_positions) > 0:
    test_images_flatten = test_images.reshape(test_images.shape[0], -1)
    test_predictions = model.predict(test_images_flatten)
    predicted_labels[loaded_positions] = encoder.inverse_transform(test_predictions)

skipped_count = len(test_df) - len(loaded_positions)
if skipped_count:
    print(f"WARNING: {skipped_count} test image(s) could not be loaded; using fallback label '{fallback_label}'")

# SUBMISSION

# predicted_labels has exactly one entry per test row, so it lines up with
# the image_id column regardless of how many images were skipped.
submission_df = pd.DataFrame({'image_id': test_df['image_id'], 'label': predicted_labels})
submission_df.to_csv(submission_path, index=False)


/kaggle/input/UBC-OCEAN/test_images
/kaggle/input/UBC-OCEAN/test_images/41.png
LOG: prepricesing img /kaggle/input/UBC-OCEAN/test_images/41.png
