# Create Ground Truth Dataset
This notebook creates an updated ground truth dataset, built from the latest annotated data.
It also splits the data into train and test sets for further model fine-tuning and evaluation.
Run this notebook each time you want to create a new dataset from updated annotations.

In [3]:
import pandas as pd
# Root folder of the thesis project on this machine.
PROJECT_ROOT = "/home/etaylor/code_projects/thesis"
# CSV that stores the ground truth table (one row per annotated image).
GROUND_TRUTH_METADATA_FILE = f"{PROJECT_ROOT}/data/metadata/ground_truth_metadata.csv"

### Option 1 - Load the ground truth dataset

In [4]:
# Option 1: read the previously saved ground truth table back from disk
# instead of recomputing it from the annotations.
ground_truth_df = pd.read_csv(filepath_or_buffer=GROUND_TRUTH_METADATA_FILE)
ground_truth_df.head(n=5)

Unnamed: 0,image_number,image_path,week_number,clear,cloudy,amber
0,IMG_2198,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,48,148,10
1,IMG_2153,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,38,120,28
2,IMG_2145,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,51,110,61
3,IMG_2129,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,75,216,9
4,IMG_1875,/sise/home/etaylor/images/raw_images/week8_07_...,week8_07_06_2023,7,77,9


### Option 2 - Calculate the Ground Truth Dataset

In [None]:
import pickle
import pandas as pd
import config

# Pickle file with the image numbers that were annotated by users.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
ANNOTATED_IMAGES_PICKLE = '/home/etaylor/code_projects/thesis/metadata/images_annotated.pkl'

with open(ANNOTATED_IMAGES_PICKLE, 'rb') as pickle_file:
    users_annotated_data = pickle.load(pickle_file)

# Wrap the loaded values in a single-column frame named `image_number`
users_annotated_df = pd.DataFrame(users_annotated_data, columns=['image_number'])
users_annotated_df.head()

Unnamed: 0,image_number
0,IMG_2129
1,IMG_2198
2,IMG_0546
3,IMG_1079
4,IMG_0543


In [None]:

# Read the annotation tracking table (columns shown below: image_number, annotator, Time, done)
annotation_tracking_df = pd.read_csv(config.ANNOTATIONS_TRACKING_METADATA_FILE)

# Keep only the records whose `done` column is True.
# `.copy()` turns the filtered result into an independent frame, so overwriting
# the `Time` column below does not raise pandas' SettingWithCopyWarning.
annotation_tracking_df = annotation_tracking_df[annotation_tracking_df.done == True].copy()
print(f"Number of records in the annotation tracking metadata file: {len(annotation_tracking_df)}")

# Parse the Time column from strings in the `%d-%m-%Y_%H-%M-%S` format into datetimes
annotation_tracking_df['Time'] = pd.to_datetime(annotation_tracking_df['Time'], format='%d-%m-%Y_%H-%M-%S')

# Keep a record when its image is listed in users_annotated_df
# OR when its Time is later than 2024-01-01 00:00:00.
is_user_annotated = annotation_tracking_df.image_number.isin(users_annotated_df.image_number)
is_after_2024_start = annotation_tracking_df.Time > pd.Timestamp('2024-01-01')
annotation_working_df = annotation_tracking_df[is_user_annotated | is_after_2024_start]
print(f"Number of records in the annotation tracking metadata file that are also annotated by users: {len(annotation_working_df)}")
annotation_working_df.head()

Number of records in the annotation tracking metadata file: 55
Number of records in the annotation tracking metadata file that are also annotated by users: 50


Unnamed: 0,image_number,annotator,Time,done
0,IMG_2198,dorins,2023-12-15 19:26:10,True
1,IMG_2153,dorins,2023-12-15 19:26:10,True
3,IMG_2145,Nirmalka,2023-12-15 19:26:10,True
4,IMG_2129,Nirmalka,2023-12-15 19:26:10,True
5,IMG_1875,dorins,2023-12-15 19:26:10,True


In [None]:
from src.annotation_handling.segmentsai_handler import SegmentsAIHandler

# Handler object used for the Segments.ai related calls in this notebook
segments_handler = SegmentsAIHandler()

# Maps every image number to the trichome distribution returned by the handler
ground_truth_trichome_distribution = {}
for image_number in annotation_working_df["image_number"]:
    print(f"get the distribution for image {image_number}")
    distribution = segments_handler.get_trichome_distribution(image_number)
    print(f"Distribution for image {image_number}: \n{distribution}")
    ground_truth_trichome_distribution[image_number] = distribution

INFO:segments.client:Initialized successfully.


get the distribution for image IMG_2198
Distribution for image IMG_2198: 
{'clear': 48, 'cloudy': 148, 'amber': 10}
get the distribution for image IMG_2153
Distribution for image IMG_2153: 
{'clear': 38, 'cloudy': 120, 'amber': 28}
get the distribution for image IMG_2145
Distribution for image IMG_2145: 
{'clear': 51, 'cloudy': 110, 'amber': 61}
get the distribution for image IMG_2129
Distribution for image IMG_2129: 
{'clear': 75, 'cloudy': 216, 'amber': 9}
get the distribution for image IMG_1875
Distribution for image IMG_1875: 
{'clear': 7, 'cloudy': 77, 'amber': 9}
get the distribution for image IMG_1787
Distribution for image IMG_1787: 
{'clear': 3, 'cloudy': 104, 'amber': 60}
get the distribution for image IMG_1857
Distribution for image IMG_1857: 
{'clear': 7, 'cloudy': 163, 'amber': 17}
get the distribution for image IMG_1753
Distribution for image IMG_1753: 
{'clear': 74, 'cloudy': 21, 'amber': 0}
get the distribution for image IMG_1818
Distribution for image IMG_1818: 
{'clea

In [None]:
# Flatten the per-image distributions into one record (row) per image
TRICHOME_CLASSES = ("clear", "cloudy", "amber")
images_annotations = []
for image_number, image_dist in ground_truth_trichome_distribution.items():
    image_path = config.get_image_path(image_number)
    # find_image_details also returns the zoom type, which is not stored in the table
    week, zoom_type = config.find_image_details(image_number)
    record = {
        "image_number": image_number,
        "image_path": image_path,
        "week_number": week,
    }
    # a class that is missing from the distribution is counted as 0
    record.update({label: image_dist.get(label, 0) for label in TRICHOME_CLASSES})
    images_annotations.append(record)
ground_truth_df = pd.DataFrame(images_annotations)

# Persist the table as csv; this overwrites GROUND_TRUTH_METADATA_FILE
# (the file name does not carry a timestamp).
ground_truth_df.to_csv(GROUND_TRUTH_METADATA_FILE, index=False)

ground_truth_df.head()

Unnamed: 0,image_number,image_path,week_number,clear,cloudy,amber
0,IMG_2198,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,48,148,10
1,IMG_2153,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,38,120,28
2,IMG_2145,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,51,110,61
3,IMG_2129,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,75,216,9
4,IMG_1875,/sise/home/etaylor/images/raw_images/week8_07_...,week8_07_06_2023,7,77,9


### Ground Truth Data Stats

In [5]:
# Number of ground truth images per week
week_counts = ground_truth_df["week_number"].value_counts()
week_counts

week_number
week9_15_06_2023    13
week6_22_05_2023    11
week5_18_05_2023    10
week7_01_06_2023     9
week8_07_06_2023     7
Name: count, dtype: int64

## Split data to train and test
Prepare a single train/test split so that every model is trained and evaluated on the same data.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ground truth images as the test set.
# The fixed seed makes the split the same on every run.
TEST_FRACTION = 0.2
SPLIT_SEED = 314977596

train_df, test_df = train_test_split(
    ground_truth_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


In [7]:
# Size of the train split and how its images are spread over the weeks
train_week_counts = train_df['week_number'].value_counts()
print(f"Number of records in the train set: {len(train_df)}")
print(f"Weeks distribution in the train set: \n{train_week_counts}")
train_df.head()

Number of records in the train set: 40
Weeks distribution in the train set: 
week_number
week6_22_05_2023    10
week9_15_06_2023    10
week5_18_05_2023     8
week8_07_06_2023     6
week7_01_06_2023     6
Name: count, dtype: int64


Unnamed: 0,image_number,image_path,week_number,clear,cloudy,amber
48,IMG_0597,/sise/home/etaylor/images/raw_images/week6_22_...,week6_22_05_2023,27,7,0
34,IMG_2163,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,14,59,24
45,IMG_0572,/sise/home/etaylor/images/raw_images/week6_22_...,week6_22_05_2023,79,65,3
3,IMG_2129,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,75,216,9
1,IMG_2153,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,38,120,28


In [8]:
# Size of the test split and how its images are spread over the weeks
test_week_counts = test_df['week_number'].value_counts()
print(f"Number of records in the test set: {len(test_df)}")
print(f"Weeks distribution in the test set: \n{test_week_counts}")
test_df.head()

Number of records in the test set: 10
Weeks distribution in the test set: 
week_number
week9_15_06_2023    3
week7_01_06_2023    3
week5_18_05_2023    2
week6_22_05_2023    1
week8_07_06_2023    1
Name: count, dtype: int64


Unnamed: 0,image_number,image_path,week_number,clear,cloudy,amber
19,IMG_0542,/sise/home/etaylor/images/raw_images/week6_22_...,week6_22_05_2023,36,49,0
0,IMG_2198,/sise/home/etaylor/images/raw_images/week9_15_...,week9_15_06_2023,48,148,10
7,IMG_1753,/sise/home/etaylor/images/raw_images/week8_07_...,week8_07_06_2023,74,21,0
12,IMG_1093,/sise/home/etaylor/images/raw_images/week7_01_...,week7_01_06_2023,33,143,36
43,IMG_0019,/sise/home/etaylor/images/raw_images/week5_18_...,week5_18_05_2023,144,100,4


Save the train and test data

In [11]:
import config

# Build the date-time suffix once so the train and test files of the same split
# always share it. Calling config.get_datetime_str() separately for each file
# could yield two different suffixes.
# NOTE(review): assumes get_datetime_str() returns the current date-time as a string - confirm.
split_timestamp = config.get_datetime_str()

# save the train data
train_df_path = f"/home/etaylor/code_projects/thesis/data/train_data_{split_timestamp}.csv"
train_df.to_csv(train_df_path, index=False)

# save the test data
test_df_path = f"/home/etaylor/code_projects/thesis/data/test_data_{split_timestamp}.csv"
test_df.to_csv(test_df_path, index=False)

### Create the new datasets in Segments.ai

In [None]:
from datetime import datetime

Create the new train and test datasets

In [None]:
# `segments_handler` is otherwise only created in the "Option 2" cell. When the
# ground truth was loaded from the csv instead (Option 1), create it here so this
# cell does not fail with a NameError on a fresh kernel.
try:
    segments_handler
except NameError:
    from src.annotation_handling.segmentsai_handler import SegmentsAIHandler
    segments_handler = SegmentsAIHandler()

# NOTE(review): zoom_type is not read in this cell; kept in case other cells rely on it.
zoom_type = '3x_regular'

# One date-time string shared by the train and test dataset names
current_date = datetime.now().strftime(config.DATETIME_STR_FORMAT)

# Settings shared by both datasets
task_type = "segmentation-bitmap"
task_attributes = {
    "format_version": "0.1",
    "categories": [{"name": "trichome", "id": 0, "color": [65,117,5]},
                {"name": "clear", "id": 1, "color": [155,155,155]},
                {"name": "cloudy", "id": 2, "color": [255,255,255]},
                {"name": "amber", "id": 3, "color": [245,166,35]}]
}

# --- Train dataset: the description lists the image numbers of the split ---
train_image_numbers_str = ", ".join(str(image_number) for image_number in train_df['image_number'].values)
dataset_name_train = f"cannabis_patches_train_{current_date}"
description_train = (
    f"Train set for images {train_image_numbers_str}."
)
train_dataset_instance = segments_handler.create_new_dataset(dataset_name_train, description_train, task_type, task_attributes)
print(train_dataset_instance)

# --- Test dataset: same settings, with the test split's image numbers ---
test_image_numbers_str = ", ".join(str(image_number) for image_number in test_df['image_number'].values)
dataset_name_test = f"cannabis_patches_test_{current_date}"
description_test = (
    f"Test set for images {test_image_numbers_str}."
)
dataset_instance_test = segments_handler.create_new_dataset(dataset_name_test, description_test, task_type, task_attributes)
print(dataset_instance_test)

name='cannabis_patches_train_26-04-2024_15-44-44' full_name='etaylor/cannabis_patches_train_26-04-2024_15-44-44' cloned_from=None description='Train set for images IMG_0597, IMG_2163, IMG_0572, IMG_2129, IMG_2153, IMG_2235, IMG_1787, IMG_0014, IMG_0045, IMG_0612, IMG_0003, IMG_1144, IMG_2157, IMG_0048, IMG_1186, IMG_0017, IMG_0016, IMG_0543, IMG_2167, IMG_2276, IMG_0547, IMG_1857, IMG_0562, IMG_1082, IMG_2134, IMG_9998, IMG_0581, IMG_1085, IMG_1827, IMG_1182, IMG_2145, IMG_0540, IMG_0546, IMG_2242, IMG_0001, IMG_0545, IMG_1784, IMG_1875, IMG_1111, IMG_1818.' category='other' public=False owner=Owner(username='etaylor', created_at='2022-12-28T12:53:18Z', email=None) created_at='2024-04-26T12:44:45.218197Z' enable_ratings=False enable_skip_labeling=True enable_skip_reviewing=False enable_save_button=False enable_label_status_verified=False enable_same_dimensions_track_constraint=False enable_interpolation=True task_type='segmentation-bitmap' label_stats=LabelStats(REVIEWED=None, REVIEWIN

Add the images to the train and test datasets

In [None]:
# For every image in the train split, copy the contents of its source dataset
# into the new train dataset (called with only_patches=True).
train_dataset_identifier = f"etaylor/{dataset_name_train}"
for image_number in train_df["image_number"]:
    week, zoom_type = config.find_image_details(image_number)
    dataset_identifier = segments_handler.get_segments_dataset_identifier(image_number, week, zoom_type)
    segments_handler.copy_dataset_contents(
        dataset_identifier,
        train_dataset_identifier,
        verbose=True,
        only_patches=True,
    )

Skipping raw image IMG_0597.JPG
Processing sample 2: IMG_0597_p0.png
  - Copied sample IMG_0597_p0.png and its label.
Processing sample 3: IMG_0597_p1.png
  - Copied sample IMG_0597_p1.png and its label.
Processing sample 4: IMG_0597_p2.png
  - Copied sample IMG_0597_p2.png and its label.
Processing sample 5: IMG_0597_p3.png
  - Copied sample IMG_0597_p3.png and its label.
Processing sample 6: IMG_0597_p4.png
  - Copied sample IMG_0597_p4.png and its label.
Processing sample 7: IMG_0597_p5.png
  - Copied sample IMG_0597_p5.png and its label.
Processing sample 8: IMG_0597_p6.png
  - Copied sample IMG_0597_p6.png and its label.
Skipping sample IMG_0597_p7.png due to label status mismatch
Processing sample 10: IMG_0597_p8.png
  - Copied sample IMG_0597_p8.png and its label.
Skipping sample IMG_0597_p9.png due to label status mismatch
Skipping raw image IMG_2163.JPG
Processing sample 2: IMG_2163_p0.png
  - Copied sample IMG_2163_p0.png and its label.
Processing sample 3: IMG_2163_p1.png
  

In [None]:
# For every image in the test split, copy the contents of its source dataset
# into the new test dataset (called with only_patches=True).
test_dataset_identifier = f"etaylor/{dataset_name_test}"
for image_number in test_df["image_number"]:
    week, zoom_type = config.find_image_details(image_number)
    dataset_identifier = segments_handler.get_segments_dataset_identifier(image_number, week, zoom_type)
    segments_handler.copy_dataset_contents(
        dataset_identifier,
        test_dataset_identifier,
        verbose=True,
        only_patches=True,
    )

Skipping raw image IMG_0542.JPG
Processing sample 2: IMG_0542_p0.png
  - Copied sample IMG_0542_p0.png and its label.
Processing sample 3: IMG_0542_p10.png
  - Copied sample IMG_0542_p10.png and its label.
Skipping sample IMG_0542_p11.png due to label status mismatch
Processing sample 5: IMG_0542_p12.png
  - Copied sample IMG_0542_p12.png and its label.
Skipping sample IMG_0542_p1.png due to label status mismatch
Processing sample 7: IMG_0542_p2.png
  - Copied sample IMG_0542_p2.png and its label.
Processing sample 8: IMG_0542_p3.png
  - Copied sample IMG_0542_p3.png and its label.
Skipping sample IMG_0542_p4.png due to label status mismatch
Processing sample 10: IMG_0542_p5.png
  - Copied sample IMG_0542_p5.png and its label.
Processing sample 11: IMG_0542_p6.png
  - Copied sample IMG_0542_p6.png and its label.
Processing sample 12: IMG_0542_p7.png
  - Copied sample IMG_0542_p7.png and its label.
Processing sample 13: IMG_0542_p8.png
  - Copied sample IMG_0542_p8.png and its label.
Sk