# Utility and Preparation

In [1]:
# !pip install kaggle
# !pip install kagglehub
# !pip install pandas
# !pip install matplotlib
# !pip install torch
# !pip install torchvision

In [2]:
from src.dataset import download_dataset, get_subset_data, closed_set_spliting

## A. Download the dataset

In [3]:
# Fetch the dataset and keep the mapping it returns. Per the captured output
# below, it holds the dataset root ('path') plus 'images_path',
# 'annotations_path', 'metadata' and 'metadata_splits'; later cells read
# paths['metadata_splits'].
paths = download_dataset()
# Show the resolved locations so the reader can confirm what was downloaded.
display(paths)

Dataset downloaded and extracted to: /Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4


{'path': '/Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4',
 'images_path': '/Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4/turtles-data/data',
 'annotations_path': '/Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4/turtles-data/data/annotations.json',
 'metadata': '/Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4/turtles-data/data/metadata.csv',
 'metadata_splits': '/Users/nhut/.cache/kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/4/turtles-data/data/metadata_splits.csv'}

## B. Create Subset of the dataset

In [4]:
# Seed forwarded to the subset sampler (seed=SEED) so the selection is repeatable.
SEED = 42

# Subset size: number of individuals to keep, and the minimum-encounter
# threshold (presumably per individual -- confirm against get_subset_data).
INDIVIDUALS, MIN_ENCOUNTERS = 10, 5

# Encounter counts handed to closed_set_spliting for each split.
TRAIN_ENCOUNTERS, VALID_ENCOUNTERS, TEST_ENCOUNTERS = 3, 1, 1

# Passed as out_dir to both the subset and the split helpers.
OUT_DIR = './data/seaturtleid2022-subset'

In [5]:
# Each selected individual must have enough encounters to fill the train,
# valid and test splits, and must also satisfy the MIN_ENCOUNTERS threshold
# from the configuration cell. Taking the max honours both; previously the
# configured MIN_ENCOUNTERS was silently ignored.
min_encounter = max(MIN_ENCOUNTERS,
                    TRAIN_ENCOUNTERS + VALID_ENCOUNTERS + TEST_ENCOUNTERS)

# Build the subset from the metadata-with-splits CSV. OUT_DIR is already a
# string, so it is passed directly rather than through an f-string.
df = get_subset_data(paths['metadata_splits'], out_dir=OUT_DIR, seed=SEED,
                     n_individuals=INDIVIDUALS,
                     min_encounters=min_encounter)

# Preview the first rows of the subset.
df.head()

Unnamed: 0,id,width,height,file_name,timestamp,identity,date,year,split_closed,split_closed_random,split_open,clarity
447,448,2000,1500,images/t023/AKOFtQyjtX.JPG,2011-07-30 13:49:47,t023,2011-07-30,2011,train,train,train,3
448,449,2000,1500,images/t023/CMTutQfpan.JPG,2013-08-03 17:26:02,t023,2013-08-03,2013,train,test,train,4
449,450,2000,1333,images/t023/CRedDLWUSY.JPG,2019-09-02 19:27:12,t023,2019-09-02,2019,valid,train,valid,1
450,451,2000,1333,images/t023/FjcFDshgIm.JPG,2019-09-02 19:20:56,t023,2019-09-02,2019,valid,valid,valid,1
451,452,2000,1500,images/t023/GSCUgXKmJm.JPG,2011-07-30 13:40:21,t023,2011-07-30,2011,train,valid,train,1


In [6]:
# Summarise the subset: which identities were kept, how many, and the image count.
identities = df['identity']
print('Identity in the dataset:', identities.unique())
print('Number of Identity:', identities.nunique())
print('Number of images:', len(df))

Identity in the dataset: ['t023' 't040' 't043' 't063' 't217' 't230' 't254' 't260' 't322' 't397']
Number of Identity: 10
Number of images: 620


## C. Closed-set Time-aware Splitting

In [7]:
# Produce the closed-set split from the subset, using the per-split
# encounter counts from the configuration cell.
dt_closed = closed_set_spliting(
    df,
    out_dir=f'{OUT_DIR}',
    train_encounters=TRAIN_ENCOUNTERS,
    valid_encounters=VALID_ENCOUNTERS,
    test_encounters=TEST_ENCOUNTERS,
)

# Visual check 1: number of distinct dates for every (identity, split) pair.
split_keys = ['identity', 'split_closed']
print("Dates per identity and split after filtering:")
dates_per_identity = dt_closed.groupby(split_keys)['date'].nunique()
display(dates_per_identity)

# Visual check 2: overall size, then image counts per (identity, split, date).
print(f"\nTotal images after filtering: {len(dt_closed)}")
print("Images per identity and date:")
images_per_date = dt_closed.groupby(split_keys + ['date']).size()
display(images_per_date)

Dates per identity and split after filtering:


  .apply(lambda g: _select_encounters_per_identity(


identity  split_closed
t023      test            1
          train           3
          valid           1
t040      test            1
          train           3
          valid           1
t043      test            1
          train           3
          valid           1
t063      test            1
          train           3
          valid           1
t217      test            1
          train           3
          valid           1
t230      test            1
          train           3
          valid           1
t254      test            1
          train           3
          valid           1
t260      test            1
          train           3
          valid           1
t322      test            1
          train           3
          valid           1
t397      test            1
          train           3
          valid           1
Name: date, dtype: int64


Total images after filtering: 359
Images per identity and date:


identity  split_closed  date      
t023      test          2022-07-01     4
          train         2011-07-30     8
                        2013-08-03     5
                        2016-10-02     4
          valid         2019-09-02     7
t040      test          2015-07-11     7
          train         2012-07-12     7
                        2015-06-25     5
                        2015-06-26     7
          valid         2015-06-30    10
t043      test          2018-07-02     4
          train         2012-07-17     5
                        2016-06-23    10
                        2016-07-02     8
          valid         2016-07-13     3
t063      test          2016-07-08     6
          train         2013-07-10     3
                        2015-07-06     9
                        2015-07-09     1
          valid         2016-07-01     4
t217      test          2018-08-22    10
          train         2016-09-12     6
                        2016-10-03     9
                      

In [8]:
# Show the rows assigned to each split, in train -> valid -> test order.
for split_name in ('train', 'valid', 'test'):
    in_split = dt_closed['split_closed'] == split_name
    display(dt_closed[in_split])

Unnamed: 0,id,width,height,file_name,timestamp,identity,date,split_closed,clarity
0,448,2000,1500,images/t023/AKOFtQyjtX.JPG,2011-07-30 13:49:47,t023,2011-07-30,train,3
1,449,2000,1500,images/t023/CMTutQfpan.JPG,2013-08-03 17:26:02,t023,2013-08-03,train,4
2,452,2000,1500,images/t023/GSCUgXKmJm.JPG,2011-07-30 13:40:21,t023,2011-07-30,train,1
3,453,2000,1333,images/t023/GbMUtabafJ.JPG,2016-10-02 09:47:34,t023,2016-10-02,train,3
4,460,2000,1333,images/t023/RRmCBXTZGC.JPG,2016-10-02 09:39:54,t023,2016-10-02,train,3
...,...,...,...,...,...,...,...,...,...
338,7006,2000,1333,images/t397/oejSxSNVQD.JPG,2018-08-21 17:27:20,t397,2018-08-21,train,3
339,7008,2000,1333,images/t397/qnbNOHImwd.JPG,2018-08-21 09:28:12,t397,2018-08-21,train,2
340,7010,2000,1333,images/t397/yxCDpLfTms.JPG,2018-08-21 17:28:25,t397,2018-08-21,train,3
341,7011,2000,1333,images/t397/zcWhFFnpRd.JPG,2018-08-21 07:41:51,t397,2018-08-21,train,3


Unnamed: 0,id,width,height,file_name,timestamp,identity,date,split_closed,clarity
17,450,2000,1333,images/t023/CRedDLWUSY.JPG,2019-09-02 19:27:12,t023,2019-09-02,valid,1
18,451,2000,1333,images/t023/FjcFDshgIm.JPG,2019-09-02 19:20:56,t023,2019-09-02,valid,1
19,459,2000,1333,images/t023/RGgyZSkQxK.JPG,2019-09-02 19:25:37,t023,2019-09-02,valid,1
20,461,2000,1333,images/t023/ReJwfWAkuE.JPG,2019-09-02 19:20:51,t023,2019-09-02,valid,2
21,472,2000,1333,images/t023/YTCKpQVEGl.JPG,2019-09-02 19:20:15,t023,2019-09-02,valid,1
...,...,...,...,...,...,...,...,...,...
345,6988,2000,1333,images/t397/WCyihxtiwd.JPG,2018-08-23 17:24:31,t397,2018-08-23,valid,1
346,6999,2000,1333,images/t397/hwnqSdMpcv.JPG,2018-08-23 17:23:54,t397,2018-08-23,valid,3
347,7003,2000,1333,images/t397/lINqnmsmtW.JPG,2018-08-23 18:37:41,t397,2018-08-23,valid,1
348,7005,2000,1333,images/t397/mEgjzrpRIx.JPG,2018-08-23 18:37:25,t397,2018-08-23,valid,2


Unnamed: 0,id,width,height,file_name,timestamp,identity,date,split_closed,clarity
24,7806,2000,1333,images/t023/ZUrnxpWVWM.jpeg,2022-07-01 07:58:41,t023,2022-07-01,test,2
25,7807,2000,1333,images/t023/egRNXuAcCX.jpeg,2022-07-01 08:01:12,t023,2022-07-01,test,2
26,7808,2000,1333,images/t023/NnVQcUTsWH.jpeg,2022-07-01 08:06:04,t023,2022-07-01,test,2
27,7809,2000,1333,images/t023/rfBSPTnOvs.jpeg,2022-07-01 08:25:43,t023,2022-07-01,test,2
57,873,2000,1333,images/t040/EkAGjrIHPI.JPG,2015-07-11 14:38:52,t040,2015-07-11,test,1
...,...,...,...,...,...,...,...,...,...
354,6991,2000,1333,images/t397/bWLCSmphTm.JPG,2018-08-24 07:42:11,t397,2018-08-24,test,2
355,7000,2000,1333,images/t397/jgamYnCfkK.JPG,2018-08-24 07:47:41,t397,2018-08-24,test,1
356,7004,2000,1333,images/t397/lpsaLAHxYz.JPG,2018-08-24 07:46:16,t397,2018-08-24,test,3
357,7009,2000,1333,images/t397/tgiLiJvtSF.JPG,2018-08-24 07:47:42,t397,2018-08-24,test,1
