# Tests on YoloV8 for fAIr
Using model V2 from Omdena results.
Mocking `test_yolo_v2.py`

## Data import and variables definition

In [29]:
# Imports: standard library (os, time, warnings, csv) mixed with third-party
# packages (ultralytics, yaml, pandas)
import os
import time
import warnings
import ultralytics
import yaml
import csv
import pandas as pd

# NOTE(review): updating os.environ with itself looks like a no-op; confirm it can be dropped
os.environ.update(os.environ)
# Point RAMP_HOME at the current working directory
# (presumably read by the ramp code inside hot_fair_utilities; confirm)
os.environ["RAMP_HOME"] = os.getcwd()

In [12]:
# fAIr utilities: generic pipeline steps (polygonize, predict, preprocess)
# plus the YOLOv8 v2 format-conversion and training entry points
from hot_fair_utilities import polygonize, predict, preprocess
from hot_fair_utilities.preprocessing.yolo_v8_v2.yolo_format_anna import yolo_format
from hot_fair_utilities.training.yolo_v8_v2 import train as train_yolo

# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action="ignore", category=FutureWarning)


In [69]:
# Define basic variables
# NOTE: machine-specific root folder; change it when running elsewhere
# (alternative used previously: f"{os.getcwd()}/ramp-data/sample_2")
base_path = '/Users/azanchetta/fAIr_metric'
data_path = f'{base_path}/training_results'
preprocessed_ramp_data_path = f'{base_path}/metric_data'
k_data_path = f'{base_path}/anna-dataset'  # added for dealing with Kshitij's data


def _subfolder_names(parent_dir):
    """Return the names of the directories located directly inside parent_dir."""
    return [
        item
        for item in os.listdir(parent_dir)
        if os.path.isdir(os.path.join(parent_dir, item))
    ]


# Folder names found in the ramp-preprocessed data
# (sample names used for tests: 'modelfake', 'model149_td489')
cities_list = _subfolder_names(preprocessed_ramp_data_path)
# Folder names found in Kshitij's data (e.g. 'dataset_489').
# Assigned on its own: the previous chained assignment
# `datasets_list = cities_list = [...]` overwrote the city names above
# and made both names point to the same list object.
datasets_list = _subfolder_names(k_data_path)

In [None]:
# Report how many folders were found, then list them one per line
print(len(cities_list))
for folder_name in cities_list:
    print(folder_name)

In [14]:

class print_time:
    """Context manager that prints how long its ``with`` body took.

    Usage: ``with print_time("step"): ...`` prints
    ``"step took <seconds> seconds"`` when the block is left, with the
    duration rounded to two decimals. Exceptions raised inside the block
    are not suppressed.
    """

    def __init__(self, name):
        # Label placed at the start of the timing message
        self.name = name

    def __enter__(self):
        # Remember when the block started; the instance is returned so the
        # caller can bind it with ``as``
        self.start = time.perf_counter()
        return self

    def __exit__(self, type, value, traceback):
        elapsed = time.perf_counter() - self.start
        print(f"{self.name} took {round(elapsed, 2)} seconds")

# Wall-clock start of the run; read by the final "Total Process Completed" print
start_time = time.time()

---

ONLY RUN THE CELL BELOW ONCE

## Generate Yolo format input files
Note: need to re-run the preprocessing, can't use the preprocessed Ramp data

We have a problem with the data: the only data I have is the ramp-preprocessed one. I can't run the YOLO preprocessing on my own, so Kshitij sent me preprocessed data (he did that in the backend).

`anna-dataset` has this structure:
```
model name
    |
     - preprocessed
        |
         - binarymasks/
         - chips/
         - inputs/
         - labels/
     - yolo_v1 
        |
         - images/
         - labels/
         - yolo_dataset.yaml
```

I need to restructure this to be consistent with my folder structure... or could just decide to use it like it is :P

**IMPORTANT** I need to rerun the preprocessing anyway, because the train/val/test split must be the same as the one used for ramp

--- renaming the folder `yolo_v2_dataset` 

In [None]:
# Load the look-up table (LUT) linking models / training datasets / dataset codes
lut_csvfile = f'{base_path}/cities_lut.csv'
lut = pd.read_csv(lut_csvfile)

In [32]:
# Preview the first rows of the look-up table
lut.head()

Unnamed: 0,id,id_model,id_train,ds_size,urban_region,country,continent,id_dataset,urban_type,density,roof_type
0,1,51,364,399,Kakuma,Kenya,Africa,58,refugee camp,sparse,metal
1,2,95,370,168,Denver,USA,America North,135,peri-urban,grid,shingles
2,3,97,372,420,Montevideo,Uruguay,America South,137,urban,grid,cement
3,4,98,373,399,Montevideo dense,Uruguay,America South,138,urban,dense,cement
4,5,102,391,231,Kutupalong,Bangladesh,Asia,144,refugee camp,dense,mixed


In [None]:
# Inspect the column types inferred by pandas
print(lut.dtypes)

In [None]:
# Cast every column to str so the ids can be compared with the string ids
# parsed from the dataset folder names in the loops further down
lut = lut.astype(str)

In [68]:
# Display the dataset folders currently selected
datasets_list

['dataset_489']

In [72]:
# For every dataset folder, look up the model and training dataset it belongs to
for dataset in datasets_list:
    # Folder names look like 'dataset_<id>'; keep only the id part
    dataset_name = dataset.split("_")[1]
    print(f'dataset {dataset} and dataset_name {dataset_name}')
    # Select the LUT row(s) of this dataset once, then read both ids from them;
    # `.values[0]` extracts the scalar instead of a piece of the dataframe
    lut_rows = lut[lut['id_dataset'] == dataset_name]
    model_name = lut_rows['id_model'].values[0]
    td_name = lut_rows['id_train'].values[0]
    print(f'model {model_name}, td {td_name}, dataset {dataset_name}')

dataset dataset_205 and dataset_name 205
model 149, td 489, dataset 205


In [71]:
# Manual override: restrict the run to a single dataset folder
datasets_list = ['dataset_205']

In [76]:
# For each dataset folder: resolve its model / training-dataset ids from the LUT,
# then convert the already-preprocessed data into the YOLO format
for dataset in datasets_list:
    dataset_name = dataset.split("_")[1]  # 'dataset_<id>' -> '<id>'

    # Read both ids from the LUT row(s) matching this dataset id;
    # `.values[0]` extracts the scalar instead of a piece of the dataframe
    lut_rows = lut[lut['id_dataset'] == dataset_name]
    model_name = lut_rows['id_model'].values[0]
    td_name = lut_rows['id_train'].values[0]
    print(f'_________\nDataset {dataset}, model {model_name}, training dataset {td_name}\n')

    # Name of this model's folder in the training results
    city = f'model{model_name}_td{td_name}'

    # Folder passed to yolo_format as csv_path
    # (presumably holds the csv listing the ramp split; confirm)
    csv_file_basepath = f'{data_path}/{city}/train'
    print(f'\n---\nNow working on {city}\n---')

    # NOTE: the generic preprocessing step is intentionally not run here:
    #   preprocess(input_path=f"{base_path}/input", output_path=f"{base_path}/preprocessed",
    #              rasterize=True, rasterize_options=["binary"], georeference_images=True,
    #              multimasks=False, epsg=4326)
    # The input below is the data that was already preprocessed in the backend.

    # Input folder, named as it appears in Kshitij's data
    dataset_foldername = f'dataset_{dataset_name}'
    city_data_dir = f'{k_data_path}/{dataset_foldername}/preprocessed'

    # Output folder for the converted files
    yolo_data_dir = f'{base_path}/yolo_v2_preprocessed'
    print(f'city is {city}')
    with print_time("yolo conversion"):
        print(f'\n___ Starting yolo files conversion\n')
        yolo_format(
            input_path=city_data_dir,
            csv_path=csv_file_basepath,
            output_path=yolo_data_dir,
            city_name=city,
        )


_________
Dataset dataset_205, model 149, training dataset 489


---
Now working on model149_td489
---
city is model149_td489

___ Starting yolo files conversion


---
data_dirs are /Users/azanchetta/fAIr_metric/anna-dataset/dataset_205/preprocessed
CSV path is /Users/azanchetta/fAIr_metric/training_results/model149_td489/train/fair_split_train.csv
this is the list from the csv file:
[['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td489/train/chips/OAM-1238662-1047076-21.tif'], ['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td489/train/chips/OAM-1238662-1047078-21.tif'], ['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td489/train/chips/OAM-309672-261767-19.tif'], ['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td489/train/chips/OAM-1238666-1047079-21.tif'], ['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td489/train/chips/OAM-1238689-1047089-21.tif'], ['/home/annazan/fAIr-utilities/ramp-data/metric_data/model149_td

100%|██████████| 103/103 [00:02<00:00, 37.77it/s]


Generating validation labels


100%|██████████| 22/22 [00:00<00:00, 53.11it/s]


Generating test labels


100%|██████████| 22/22 [00:00<00:00, 47.52it/s]


Generating training images


100%|██████████| 103/103 [00:00<00:00, 225.92it/s]


Generating validation images


100%|██████████| 22/22 [00:00<00:00, 222.27it/s]


Generating test images


100%|██████████| 22/22 [00:00<00:00, 234.78it/s]

yolo conversion took 4.3 seconds





---

## Training

In [None]:
# Manual override: train only on the listed folder(s); 'modelfake' and
# 'model149_td489' are sample names used for tests
cities_list = ['modelfake']
# To train on every folder instead:
# cities_list= [ item for item in os.listdir(preprocessed_ramp_data_path) if os.path.isdir(os.path.join(preprocessed_ramp_data_path, item)) ]

In [25]:
# Training: one YOLO run per city, each with its own copy of the dataset YAML
yolo_output_path = f'{base_path}/yolo_v2_training'
yolo_data_dir = f'{base_path}/yolo_v2'  # not read in this cell
with print_time("yolo training"):
    # Template dataset YAML; only its 'path' entry is changed for each city
    basic_yaml_file_name_with_path = '/Users/azanchetta/fAIr-utilities/ramp-data/sample_2/yolo_v2/yolo_dataset.yaml'
    for city in cities_list:
        city_yolodata_path = f'{base_path}/yolo_v2_data/{city}'
        city_output_path = f'{yolo_output_path}/{city}'
        yaml_file_path_for_city = f'{city_yolodata_path}/yolo_dataset.yaml'
        info_to_write_in_yaml = city_yolodata_path  # not read in this cell

        # Load the template, point 'path' at this city's YOLO data folder,
        # and save the result next to the city's data
        with open(basic_yaml_file_name_with_path, 'r') as template_fh:
            dataset_cfg = yaml.safe_load(template_fh)
        dataset_cfg['path'] = city_yolodata_path
        with open(yaml_file_path_for_city, 'w') as city_fh:
            yaml.dump(dataset_cfg, city_fh)

        # Run the training (the `gpu` argument is left at its default)
        # NOTE(review): the recorded output of this cell ends with
        # "ZeroDivisionError: float division by zero" after validation reported
        # all-zero metrics; presumably raised inside train_yolo, verify
        pretrained_weights = f"{os.getcwd()}/yolov8s_v2-seg.pt"
        output_model_path, output_model_iou_accuracy = train_yolo(
            data=city_yolodata_path,
            weights=pretrained_weights,
            epochs=3,
            batch_size=16,
            pc=2.0,
            output_path=city_output_path,
            dataset_yaml_path=yaml_file_path_for_city,
        )
        print(output_model_iou_accuracy)

Backbone: s, Dataset: yolo_v2_data, Epochs: 3
New https://pypi.org/project/ultralytics/8.3.40 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.26 🚀 Python-3.12.4 torch-2.2.2 CPU (Intel Core(TM) i9-9980HK 2.40GHz)
[34m[1mengine/trainer: [0mtask=segment, mode=train, model=/Users/azanchetta/fAIr-utilities/yolov8s_v2-seg.pt, data=/Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/yolo_dataset.yaml, epochs=3, time=None, patience=100, batch=16, imgsz=256, save=True, save_period=-1, cache=True, device=cpu, workers=8, project=/Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints, name=yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=False, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=False, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False,

[34m[1mtrain: [0mScanning /Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/labels/train... 103 images, 2 backgrounds, 0 corrupt: 100%|██████████| 103/103 [00:00<00:00, 765.94it/s]

[34m[1mtrain: [0mNew cache created: /Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/labels/train.cache



[34m[1mtrain: [0mCaching images (0.0GB RAM): 100%|██████████| 103/103 [00:00<00:00, 1499.29it/s]
[34m[1mval: [0mScanning /Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/labels/val... 22 images, 5 backgrounds, 0 corrupt: 100%|██████████| 22/22 [00:00<00:00, 774.10it/s]

[34m[1mval: [0mNew cache created: /Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/labels/val.cache



[34m[1mval: [0mCaching images (0.0GB RAM): 100%|██████████| 22/22 [00:00<00:00, 1906.74it/s]


Plotting labels to /Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.00854' and 'momentum=0.95275' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 66 weight(decay=0.0), 77 weight(decay=0.00058), 76 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 256 train, 256 val
Using 0 dataloader workers
Logging results to [1m/Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0[0m
Starting training for 3 epochs...

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


        1/3         0G          0          0      856.5          0          0        256: 100%|██████████| 7/7 [00:14<00:00,  2.02s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]

                   all         22         17          0          0          0          0          0          0          0          0






      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


        2/3         0G          0          0      255.4          0          0        256: 100%|██████████| 7/7 [00:13<00:00,  1.92s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]

                   all         22         17          0          0          0          0          0          0          0          0






      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


        3/3         0G          0          0      24.44          0          0        256: 100%|██████████| 7/7 [00:26<00:00,  3.73s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:12<00:00, 12.30s/it]

                   all         22         17          0          0          0          0          0          0          0          0






3 epochs completed in 0.024 hours.
Optimizer stripped from /Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0/weights/last.pt, 23.8MB
Optimizer stripped from /Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0/weights/best.pt, 23.8MB

Validating /Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0/weights/best.pt...
Ultralytics 8.3.26 🚀 Python-3.12.4 torch-2.2.2 CPU (Intel Core(TM) i9-9980HK 2.40GHz)
YOLOv8s-seg summary (fused): 195 layers, 11,779,987 parameters, 0 gradients, 42.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]

                   all         22         17          0          0          0          0          0          0          0          0





Speed: 1.0ms preprocess, 282.2ms inference, 0.0ms loss, 2.2ms postprocess per image
Results saved to [1m/Users/azanchetta/fAIr_metric/yolo_v2_predictions/model149_td489/checkpoints/yolov8s-seg_yolo_v2_data_ep3_bs16_pc2.0[0m
Ultralytics 8.3.26 🚀 Python-3.12.4 torch-2.2.2 CPU (Intel Core(TM) i9-9980HK 2.40GHz)
YOLOv8s-seg summary (fused): 195 layers, 11,779,987 parameters, 0 gradients, 42.4 GFLOPs


[34m[1mval: [0mScanning /Users/azanchetta/fAIr_metric/yolo_v2_data/model149_td489/labels/val.cache... 22 images, 5 backgrounds, 0 corrupt: 100%|██████████| 22/22 [00:00<?, ?it/s]




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:03<00:00,  1.76s/it]

                   all         22         17          0          0          0          0          0          0          0          0





Speed: 0.4ms preprocess, 136.7ms inference, 0.0ms loss, 1.0ms postprocess per image
Results saved to [1m/Users/azanchetta/fAIr-utilities/runs/segment/val6[0m
yolo training took 119.74 seconds


ZeroDivisionError: float division by zero

In [None]:
# Training (variant): every city is trained against the same placeholder YAML
# and all runs are given the same output folder
yolo_output_path = f'{base_path}/yolo_v2_predictions'
yolo_data_dir = f'{base_path}/yolo_v2'  # not read in this cell
with print_time("yolo training"):
    for city in cities_list:
        city_yolodata_path = f'{base_path}/yolo_v2_data/{city}'
        print(f'city path: {city_yolodata_path}')

        # This YAML name is just a placeholder: the variables are overwritten in the code
        placeholder_yaml = '/Users/azanchetta/fAIr-utilities/ramp-data/sample_2/yolo_v2/yolo_dataset.yaml'
        pretrained_weights = f"{os.getcwd()}/yolov8s_v2-seg.pt"

        # Run the training (the `gpu` argument is left at its default)
        output_model_path, output_model_iou_accuracy = train_yolo(
            data=city_yolodata_path,
            weights=pretrained_weights,
            epochs=2,
            batch_size=16,
            pc=2.0,
            output_path=yolo_output_path,
            dataset_yaml_path=placeholder_yaml,
        )
        print(output_model_iou_accuracy)

## Prediction

In [None]:
# Prediction: run inference with the checkpoint returned by the training cell,
# then polygonize the predictions into a GeoJSON file
prediction_input = f"{base_path}/prediction/input"
prediction_output = f"{base_path}/prediction/output"
with print_time("inference"):
    predict(
        checkpoint_path=output_model_path,
        input_path=prediction_input,
        prediction_path=prediction_output,
    )

geojson_output = f"{prediction_output}/prediction.geojson"
with print_time("polygonization"):
    polygonize(
        input_path=prediction_output,
        output_path=geojson_output,
        remove_inputs=False,  # keep the prediction files after polygonizing
    )

# Total wall-clock time since start_time was set
elapsed_total = time.time() - start_time
print(f"\n Total Process Completed in : {elapsed_total} sec")

In [None]:
# # Deal with the csv files with list of train/val/pred images used in RAMP
# #  testing function ... this goes inside `fined_files` in yolo_format_anna.py
# city="model51_td364"
# city_folder_name=f'{data_path}/{city}/train'
# csv_file_name = f'fair_split_train.csv'
# csv_file_path = f'{city_folder_name}/{csv_file_name}'
# print(f'CSV file is {csv_file_name}')
# print(f'CSV file is {csv_file_path}')
# csv_raw_list = []

# with open(csv_file_path, "r") as file_obj:
#     heading = next(file_obj)
#     reader_obj = csv.reader(file_obj, delimiter="\t")
#     for row in reader_obj:
#         csv_raw_list.append(row)
# print(f'this is the list from the csv file:\n{csv_raw_list}')
# csv_nested_list = []
# for ccc in csv_raw_list:
#     # print(ccc)
#     nested = ccc[0]
#     # print(f'nested {nested}')
#     name_csv = nested.split('/')[-1]
#     csv_nested_list.append(name_csv)

# # filenames_from_csv = [csvi.split("/",1)[-1] for csvi in csv_nested_list] # this is to get the last element of the string (i.e. the file name)
# # print(f'filenames hopefully {filenames_from_csv}')
# print(f'is this the names? {csv_nested_list}')