# Data exploration

In [1]:
import re

import pandas as pd
import numpy as np
import os
from PIL import Image



In [43]:

# Path to your image folder
image_folder = os.path.join("..", "data", "images")
metadata_file = os.path.join("..", "data", "metadata.csv")

# 1. List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]
print(f"there is a total of {image_files.__len__()} images")


there is a total of 15089 images


In [44]:
# create a list of all image files
image_files = list()
for dirpath, dirnames, filenames in os.walk(image_folder):
    for file in filenames:
        if file.lower().endswith('.jpg') or file.lower().endswith('.png'):
            image_files.append(os.path.join(dirpath, file))
image_files[:10]

['../data/images/7012189a-d8c9-4bd5-a339-01c59e069cbe__1.jpg',
 '../data/images/6434e2dc-4d94-4a19-bc1b-00feb6155d09__2.jpg',
 '../data/images/a1c08dea-63e0-4858-b230-098b3d954c04__2.jpg',
 '../data/images/aa76952f-2f56-42dd-9f0a-a4d55e7a0164__3.jpg',
 '../data/images/3f5158b6-f9a1-44d4-b52e-21ed792b90e1__2.jpg',
 '../data/images/3a9f875a-364f-45f4-b030-6942a0c6d3ba__2.jpg',
 '../data/images/605f9c52-c00f-47ac-a8d3-e1832ce1f534__2.jpg',
 '../data/images/e3fa23b1-a1f3-4ee1-bd30-560fab810f0c__2.jpg',
 '../data/images/d521b38b-4b28-4958-b784-d7b960244de8__4.jpg',
 '../data/images/4730a222-fc65-48e8-8280-586003fd52aa__2.jpg']

In [73]:

# we are going to want to do stuff with the metadata so let's create a df, with each row being an image and the associated attributes from the metadata
metadata_df = pd.read_csv(metadata_file)
print(metadata_df.shape)
metadata_df.head()

(7828, 11)


Unnamed: 0,uuid,fish_id,age,length,weight,month,is_male,is_female,is_unknown,is_plaice,is_herring
0,388b0180-60bc-440d-bd61-446f8bab884a,CAR-025-62-1466,4,0.22,0.07,0.75,0,1,0,1,0
1,81aeec93-b96d-4e14-b3f4-f49c4a8c18c7,CAR-025-72-1692,6,0.2,0.057,0.75,1,0,0,1,0
2,230abf76-c986-423d-b569-087489990963,CAR-025-3-0022,5,0.24,0.09,0.75,0,1,0,1,0
3,c33448fa-5c76-4a3a-b070-619a623906bc,CAR-025-36-0832,6,0.23,0.091,0.75,1,0,0,1,0
4,f26dc915-152e-4e3c-86b3-03252497dbb1,TEL-232-73-0573,3,0.15,0.021,0.75,0,0,1,1,0


In [78]:
def get_uuid_from_path(x):
    return re.split(r'["\\|\/"]', x)[-1].split('__')[0]


df = pd.DataFrame(dict(img_filepath=image_files))
df["uuid"] = df["img_filepath"].apply(get_uuid_from_path)
df = df.merge(metadata_df, on="uuid", how="left")
print(df.shape)
df.head()


(15089, 12)


Unnamed: 0,img_filepath,uuid,fish_id,age,length,weight,month,is_male,is_female,is_unknown,is_plaice,is_herring
0,../data/images/7012189a-d8c9-4bd5-a339-01c59e069cbe__1.jpg,7012189a-d8c9-4bd5-a339-01c59e069cbe,14141-022,9,0.3217,0.26,0.75,0,1,0,0,1
1,../data/images/6434e2dc-4d94-4a19-bc1b-00feb6155d09__2.jpg,6434e2dc-4d94-4a19-bc1b-00feb6155d09,13977-008,6,0.301,0.23961,0.666667,0,1,0,0,1
2,../data/images/a1c08dea-63e0-4858-b230-098b3d954c04__2.jpg,a1c08dea-63e0-4858-b230-098b3d954c04,TEL-232-54-0375,4,0.13,0.013,0.75,0,0,1,1,0
3,../data/images/aa76952f-2f56-42dd-9f0a-a4d55e7a0164__3.jpg,aa76952f-2f56-42dd-9f0a-a4d55e7a0164,CAR-025-31-0745,6,0.21,0.06,0.75,0,1,0,1,0
4,../data/images/3f5158b6-f9a1-44d4-b52e-21ed792b90e1__2.jpg,3f5158b6-f9a1-44d4-b52e-21ed792b90e1,CAR-025-46-1038,4,0.17,0.038,0.75,1,0,0,1,0


# First question: what is an appropriate image size to use? The full size takes a really long time to run and I would expect the benefits of higher res will start to plateau at a certain point. 

In [86]:
# for this, let's look at just one type of otolith

mask = df["is_plaice"] == 1
plaice_df = df[mask]
plaice_df.head()

Unnamed: 0,img_filepath,uuid,fish_id,age,length,weight,month,is_male,is_female,is_unknown,is_plaice,is_herring
2,../data/images/a1c08dea-63e0-4858-b230-098b3d954c04__2.jpg,a1c08dea-63e0-4858-b230-098b3d954c04,TEL-232-54-0375,4,0.13,0.013,0.75,0,0,1,1,0
3,../data/images/aa76952f-2f56-42dd-9f0a-a4d55e7a0164__3.jpg,aa76952f-2f56-42dd-9f0a-a4d55e7a0164,CAR-025-31-0745,6,0.21,0.06,0.75,0,1,0,1,0
4,../data/images/3f5158b6-f9a1-44d4-b52e-21ed792b90e1__2.jpg,3f5158b6-f9a1-44d4-b52e-21ed792b90e1,CAR-025-46-1038,4,0.17,0.038,0.75,1,0,0,1,0
8,../data/images/d521b38b-4b28-4958-b784-d7b960244de8__4.jpg,d521b38b-4b28-4958-b784-d7b960244de8,CAR-025-74-1785,4,0.16,0.026,0.75,1,0,0,1,0
12,../data/images/14e651a6-0a35-4e0d-bbfa-36a12b54d10d__2.jpg,14e651a6-0a35-4e0d-bbfa-36a12b54d10d,TEL-232-73-0555,7,0.31,0.241,0.75,0,0,1,1,0


In [102]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split

sample_size = 300
# let's select a random sample of `sample_size` images
sample = df.sample(n=sample_size, replace=False)

# define the sizes we want to test
sizes = np.arange(100, 1100, 100)

scores = list()
y = sample["age"]

for s in sizes:
    print(f"computing score for size {s}")
    data = list()
    columns = [f"pixel_{str(i + 1).rjust(7, '0')}" for i in range(0, s ** 2 * 3)]
    for index, row in sample.iterrows():
        filepath = row["img_filepath"]
        with Image.open(filepath) as img:
            # resize img appropriately
            img = img.resize((s, s))
            # convert to flattened np array
            img_array = np.array(img).flatten()
            # append to data list
            data.append(img_array)

    # store the list of arrays in a matrix
    X = np.array(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf = RandomForestClassifier(n_estimators=50, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # 11. Evaluate the model
    s = accuracy_score(y_test, y_pred)
    print(f"Accuracy score: {s}")

    # print(f"Mean Squared Error (MSE): {mse:.2f}")
    # print(f'Root Mean Squared Error: {rmse}')
    # print(f"R-squared (R²): {r2:.2f}")

computing score for size 100
Accuracy score: 0.16666666666666666
computing score for size 200
Accuracy score: 0.21666666666666667
computing score for size 300
Accuracy score: 0.21666666666666667
computing score for size 400
Accuracy score: 0.18333333333333332
computing score for size 500
Accuracy score: 0.23333333333333334
computing score for size 600
Accuracy score: 0.18333333333333332
computing score for size 700
Accuracy score: 0.26666666666666666
computing score for size 800
Accuracy score: 0.21666666666666667
computing score for size 900
Accuracy score: 0.21666666666666667
computing score for size 1000
Accuracy score: 0.23333333333333334


3000000

- using a small subsample, what happens when we build a model / classifier using different resolution sizes..  
- We'll want to make sure that the training dataset has good representation of the two species and each size class