# Data exploration

In [1]:
import re

import pandas as pd
import numpy as np
import os
from PIL import Image



In [43]:

# Path to your image folder
image_folder = os.path.join("..", "data", "images")
metadata_file = os.path.join("..", "data", "metadata.csv")

# 1. List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]
print(f"there is a total of {image_files.__len__()} images")


there is a total of 15089 images


In [44]:
# create a list of all image files
image_files = list()
for dirpath, dirnames, filenames in os.walk(image_folder):
    for file in filenames:
        if file.lower().endswith('.jpg') or file.lower().endswith('.png'):
            image_files.append(os.path.join(dirpath, file))
image_files[:10]

['../data/images/7012189a-d8c9-4bd5-a339-01c59e069cbe__1.jpg',
 '../data/images/6434e2dc-4d94-4a19-bc1b-00feb6155d09__2.jpg',
 '../data/images/a1c08dea-63e0-4858-b230-098b3d954c04__2.jpg',
 '../data/images/aa76952f-2f56-42dd-9f0a-a4d55e7a0164__3.jpg',
 '../data/images/3f5158b6-f9a1-44d4-b52e-21ed792b90e1__2.jpg',
 '../data/images/3a9f875a-364f-45f4-b030-6942a0c6d3ba__2.jpg',
 '../data/images/605f9c52-c00f-47ac-a8d3-e1832ce1f534__2.jpg',
 '../data/images/e3fa23b1-a1f3-4ee1-bd30-560fab810f0c__2.jpg',
 '../data/images/d521b38b-4b28-4958-b784-d7b960244de8__4.jpg',
 '../data/images/4730a222-fc65-48e8-8280-586003fd52aa__2.jpg']

In [45]:
# just for experimenting, let's play with one file
#f = image_files[0]
data = list()
uuids = list() # create a list to store the uuids as files are being processed
columns = [ f"pixel_{str(i+1).rjust(7,'0')}" for i in range(0,1000*1000*3)]
for f in np.random.choice(image_files, size=1000, replace=False):
    with Image.open(f) as img:
        # ensure each image has 3 layers and the size is 1000x1000
        assert img.layers == 3
        assert img.size == (1000,1000)
        
        # convert to flattened np array
        img_array = np.array(img).flatten()
        
        # append to data list
        data.append(img_array)    
        
        # get the uuid from the filename
        uuid = re.split(r'["\\|\/"]', img.filename)[-1].split('__')[0]
        uuids.append(uuid)
                
# store the list of arrays in a matrix
X = np.array(data)

In [46]:
# now we need to convert the uuids into ages using the metadata!

metadata_df = pd.read_csv(metadata_file)
print(metadata_df.shape)
metadata_df.head()



(7828, 11)


Unnamed: 0,uuid,fish_id,age,length,weight,month,is_male,is_female,is_unknown,is_plaice,is_herring
0,388b0180-60bc-440d-bd61-446f8bab884a,CAR-025-62-1466,4,0.22,0.07,0.75,0,1,0,1,0
1,81aeec93-b96d-4e14-b3f4-f49c4a8c18c7,CAR-025-72-1692,6,0.2,0.057,0.75,1,0,0,1,0
2,230abf76-c986-423d-b569-087489990963,CAR-025-3-0022,5,0.24,0.09,0.75,0,1,0,1,0
3,c33448fa-5c76-4a3a-b070-619a623906bc,CAR-025-36-0832,6,0.23,0.091,0.75,1,0,0,1,0
4,f26dc915-152e-4e3c-86b3-03252497dbb1,TEL-232-73-0573,3,0.15,0.021,0.75,0,0,1,1,0


In [47]:

# create a lookup from uuid to age using df
uuid_age_lookup = dict()
for index, row in metadata_df.iterrows():
    uuid_age_lookup[row["uuid"]] = row["age"]
y = np.array([uuid_age_lookup[uuid] for uuid in uuids])
y.shape


(1000,)

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), slice(0,X.shape[0])),  # Standard scaling for numerical features
#     ])
# X = preprocessor.fit_transform(X)

In [48]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Train a Random Forest Regressor
rf = RandomForestRegressor(n_estimators=50, random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)


# # 11. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
# 
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f'Root Mean Squared Error: {rmse}')
print(f"R-squared (R²): {r2:.2f}")


# takes 13 h to process 1000 images (1000x1000)


# from sklearn.metrics import accuracy_score

# accuracy_score(y_true=y_test, y_pred=y_pred)


Mean Squared Error (MSE): 4.41
Root Mean Squared Error: 2.0993832427644077
R-squared (R²): 0.40


# First question: what is an appropriate image size to use? The full size takes a really long time to run and I would expect the benefits of higher res will start to plateau at a certain point. 

In [51]:
# just for experimenting, let's play with one file
#f = image_files[0]

sizes = np.arange(25,1000, 15)

results = list()
sample = np.random.choice(image_files, size=100, replace=False)
for s in sizes:
    print(f"computing score for size {s}")
    data = list()
    uuids = list() # create a list to store the uuids as files are being processed
    columns = [ f"pixel_{str(i+1).rjust(7,'0')}" for i in range(0,s**2*3)]
    for f in sample:
        with Image.open(f) as img:
            # ensure each image has 3 layers and the size is 1000x1000
            assert img.layers == 3
            assert img.size == (1000,1000)
            img = img.resize((s,s))
            # convert to flattened np array
            img_array = np.array(img).flatten()
    
            # append to data list
            data.append(img_array)    
    
            # get the uuid from the filename
            uuid = re.split(r'["\\|\/"]', img.filename)[-1].split('__')[0]
            uuids.append(uuid)
    
    # store the list of arrays in a matrix
    X = np.array(data)

3000000

- using a small subsample, what happens when we build a model / classifier using different resolution sizes..  
- We'll want to make sure that the training dataset has good representation of the two species and each size class