We implement a self-training variant of semi-supervised learning because the dataset contains 1200 labelled images and 5000+ unlabelled images.

## 1. Importing Libraries

In [None]:
# !pip install ultralytics==8.0.176 -q
# !pip install pycocotools -q

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
os.environ['WANDB_DISABLED'] = 'true'

import shutil
import json
import random
from tqdm import tqdm

from PIL import Image

from ultralytics import YOLO

## 2. Creating Directories

In [None]:
# Creating Directories

parent_dirpath = "/kaggle/working/yolov8"

# Leaf folders only: os.makedirs creates the missing parents
# (parent_dirpath, train/, test/) on the way. exist_ok=True lets this cell be
# re-run without raising FileExistsError; existing contents are left untouched.
required_dirs = (
    "/kaggle/working/temp_images",
    "/kaggle/working/temp_labels",
    os.path.join(parent_dirpath, "train", "images"),
    os.path.join(parent_dirpath, "train", "labels"),
    os.path.join(parent_dirpath, "test", "images"),
    os.path.join(parent_dirpath, "test", "labels"),
)

for required_dir in required_dirs:
    os.makedirs(required_dir, exist_ok=True)


## 3. Defining Functions

In [None]:
def tiff_to_jpg(file_name):
    """Convert one training TIFF image to a JPEG in the temporary image folder.

    Reads ``<file_name>.tif`` from the competition ``train`` folder and writes
    ``/kaggle/working/temp_images/<file_name>.jpg``.

    Args:
        file_name: Image id, i.e. the file name without its extension. It is
            converted with ``str`` once so both paths are built the same way.

    Returns:
        int: Always 0.
    """
    file_name = str(file_name)
    tiff_image_path = "/kaggle/input/hubmap-hacking-the-human-vasculature/train/" + file_name + ".tif"
    destination_path = "/kaggle/working/temp_images/" + file_name + ".jpg"

    # The context manager closes the source file handle as soon as the JPEG
    # copy is written, instead of leaving one open handle per converted image.
    with Image.open(tiff_image_path) as tiff_image:
        tiff_image.save(destination_path, 'JPEG')

    return 0

def vertices_to_txt(file_id, annotations, list_of_vertices, image_size=512,
                    labels_dir="/kaggle/working/temp_labels"):
    """Write the polygon annotations of one image as a YOLO segmentation label file.

    Each kept annotation becomes one line ``<class> x1 y1 x2 y2 ...`` with the
    coordinates divided by ``image_size``. Lines are separated by a newline and
    the file has no trailing newline. Only ``'blood_vessel'`` (class 0) and
    ``'glomerulus'`` (class 1) annotations are written; any other type is
    skipped entirely, so it no longer leaves a blank line in the file.

    Args:
        file_id: Image id; the output file is ``<labels_dir>/<file_id>.txt``.
        annotations: List of dicts with a ``'type'`` key and a ``'coordinates'``
            key. Only the first entry of ``'coordinates'`` is used as the polygon.
        list_of_vertices: Unused. Kept so existing three-argument calls keep
            working; the vertices are always read from ``annotations``.
        image_size: Divisor used to normalise the coordinates (default 512).
        labels_dir: Folder that receives the label file.

    Returns:
        int: Always 0.
    """
    class_ids = {'blood_vessel': '0', 'glomerulus': '1'}
    file_contents = []

    for annotation in annotations:
        class_id = class_ids.get(annotation['type'])
        if class_id is None:
            # Not a class the model is trained on: write nothing for it.
            continue

        yolo_format = [class_id]
        for vertex in annotation['coordinates'][0]:
            yolo_format.append(str(vertex[0] / image_size))
            yolo_format.append(str(vertex[1] / image_size))

        file_contents.append(" ".join(yolo_format))

    file_name = os.path.join(labels_dir, str(file_id) + ".txt")

    # An empty file is still written when no annotation is kept.
    with open(file_name, "w") as file:
        file.write("\n".join(file_contents))

    return 0

## 4. Data Processing

In [None]:
# Folder with every competition image; later cells separate the annotated
# ids from the rest.
train_filepath = "/kaggle/input/hubmap-hacking-the-human-vasculature/train"

all_images = os.listdir(train_filepath)
image_count = len(all_images)
print("No. of images:", image_count)

In [None]:
# Convert every annotated image to JPEG and write its YOLO label file.
# file_ids collects the ids of all images that have an entry in polygons.jsonl.
json_filepath = "/kaggle/input/hubmap-hacking-the-human-vasculature/polygons.jsonl"
file_ids = []

with open(json_filepath, 'r') as file:

    for line in file:
        # A blank line is not a JSON document; skip it instead of crashing.
        if not line.strip():
            continue

        data = json.loads(line)
        file_id = data['id']
        annotations = data['annotations']

        # vertices_to_txt reads the polygons from `annotations` itself, so this
        # value only fills its third positional parameter. Guard the lookup so
        # an entry with an empty annotation list does not raise IndexError.
        list_of_vertices = annotations[0]['coordinates'][0] if annotations else []

        tiff_to_jpg(file_id)
        vertices_to_txt(file_id, annotations, list_of_vertices)
        file_ids.append(file_id)

In [None]:
# An image id is the part of the file name before the first dot.
all_file_ids = [image_name.split('.')[0] for image_name in all_images]

# Ids present in the train folder but absent from polygons.jsonl.
every_id = set(all_file_ids)
unlabelled_images = list(every_id.difference(file_ids))

In [None]:
random.shuffle(file_ids)

# The first 80% of the shuffled ids form the train set, the remainder the test set.
split_at = int(0.8*len(file_ids))
subsets = (
    # Train directory
    (file_ids[:split_at],
     "/kaggle/working/yolov8/train/images/",
     "/kaggle/working/yolov8/train/labels/"),
    # Test directory
    (file_ids[split_at:],
     "/kaggle/working/yolov8/test/images/",
     "/kaggle/working/yolov8/test/labels/"),
)

for subset_ids, images_dir, labels_dir in subsets:
    for image_id in subset_ids:
        # Copy the image first, then its label file, for each id.
        shutil.copy("/kaggle/working/temp_images/" + str(image_id) + ".jpg",
                    images_dir + str(image_id) + ".jpg")
        shutil.copy("/kaggle/working/temp_labels/" + str(image_id) + ".txt",
                    labels_dir + str(image_id) + ".txt")

# The staging folders are no longer needed once everything is copied.
for staging_dir in ("/kaggle/working/temp_images", "/kaggle/working/temp_labels"):
    shutil.rmtree(staging_dir)

In [None]:
# Creating custom_config.yaml file
# Note: "val" and "test" both point at the test/images folder.
config_lines = [
    "path: /kaggle/working/yolov8",
    "train: train/images",
    "val: test/images",
    "test: test/images",
    "nc: 2",
    "names: ['blood_vessel','glomerulus']",
]

# Lines are newline-separated, with no newline after the last one.
with open("/kaggle/working/custom_config.yaml", "w") as config_file:
    config_file.write("\n".join(config_lines))

In [None]:
# Ids of the images placed in the test folder (file name up to the first dot).
test_set_filepath = "/kaggle/working/yolov8/test/images"
test_set_images_filepaths = os.listdir(test_set_filepath)
test_set_file_ids = [entry.split('.')[0] for entry in test_set_images_filepaths]

# Displayed as the cell output.
len(test_set_file_ids)

## 5. Model training for 10 iterations

In [None]:
# Get all file_ids from test set.
test_set_filepath = "/kaggle/working/yolov8/test/images"
test_set_images_filepaths = os.listdir(test_set_filepath)
test_set_file_ids = [name.split('.')[0] for name in test_set_images_filepaths]

# Self-training settings.
# An image is pseudo-labelled only if EVERY detection reaches this confidence.
min_confidence = 0.4
train_set_filepath = "/kaggle/working/yolov8/train/images"
# Scratch JPEG, overwritten for each unlabelled image. The last one written is
# left on disk after the loop.
destination_path = "/kaggle/working/temp_image.jpg"

# Model training
model = YOLO('yolov8x-seg.pt')

for iteration in range(10):

    # Each round continues training the same model object on the current
    # train folder, which grows as pseudo-labelled images are added below.
    results = model.train(data='/kaggle/working/custom_config.yaml',
                          epochs=15, imgsz=512, optimizer='Adam',
                          seed=42, close_mosaic=0, mask_ratio=1, val=True,
                          degrees=90, translate=0.1, scale=0.5, flipud=0.5, fliplr=0.5)

    images_added = 0

    # 1. Get all file_ids from train_set
    train_set_images_filepaths = os.listdir(train_set_filepath)
    train_set_file_ids = [name.split('.')[0] for name in train_set_images_filepaths]

    # 2. Add file_ids from test_set
    all_labelled_image_file_ids = train_set_file_ids + test_set_file_ids

    # 3. Find differences from inputs and combined list & create new unlabelled image file_id list.
    unlabelled_images = list(set(all_file_ids).difference(all_labelled_image_file_ids))

    for file_id in tqdm(unlabelled_images):

        tiff_image_path = "/kaggle/input/hubmap-hacking-the-human-vasculature/train/" + str(file_id) + ".tif"
        # Close the TIFF handle right after conversion; this runs once per
        # unlabelled image in every iteration.
        with Image.open(tiff_image_path) as tiff_image:
            tiff_image.save(destination_path, 'JPEG')

        results = model.predict(destination_path, verbose=False)

        # One image path was passed in, so a single result is expected.
        result = results[0]
        confidences = result.boxes.conf

        # Reject images without detections (masks are unavailable then) and
        # images with at least one low-confidence detection.
        if len(confidences) == 0:
            continue
        if any(confidence < min_confidence for confidence in confidences):
            continue

        classes = result.boxes.cls
        masks = result.masks.xyn

        # A polygon needs at least three vertices. An empty or degenerate
        # contour would produce a malformed label row, so such an image is left
        # unlabelled and reconsidered in the next iteration.
        if any(len(polygon) < 3 for polygon in masks):
            continue

        file_contents = []
        for predicted_class, polygon in zip(classes, masks):
            # Class 1 is kept as 1; every other predicted class is written as 0.
            yolo_format = [str(1) if predicted_class == 1 else str(0)]
            for vertex in polygon:
                # The coordinates are already normalised, so no division here.
                yolo_format.append(str(vertex[0]))
                yolo_format.append(str(vertex[1]))
            file_contents.append(" ".join(yolo_format))

        # Add the image and its pseudo-labels to the training set.
        des_img_filepath = "/kaggle/working/yolov8/train/images/" + str(file_id) + ".jpg"
        shutil.copy(destination_path, des_img_filepath)

        file_name = "/kaggle/working/yolov8/train/labels/" + str(file_id) + ".txt"
        with open(file_name, "w") as file:
            file.write("\n".join(file_contents))

        images_added += 1

    print("Images added to training set:", images_added)

In [None]:
# Count the label files in the train folder (original plus pseudo-labelled).
train_label_count = len(os.listdir("/kaggle/working/yolov8/train/labels"))
print("Final length of Train dataset:", train_label_count)

In [None]:
# Inspect the raw prediction for the scratch image left behind by the
# self-training loop.
results = model.predict("/kaggle/working/temp_image.jpg")

for result in results:
    boxes = result.boxes.conf # confidence scores
    classes = result.boxes.cls # class in float
    # result.masks is None when nothing was detected; fall back to an empty
    # list instead of raising AttributeError on `.xyn`.
    masks = result.masks.xyn if result.masks is not None else [] # location of each segment, normalised

## 6. Final model training

In [None]:
# Same settings as the self-training rounds, except for the number of epochs.
final_train_settings = {
    'data': '/kaggle/working/custom_config.yaml',
    'epochs': 50,
    'imgsz': 512,
    'optimizer': 'Adam',
    'seed': 42,
    'close_mosaic': 0,
    'mask_ratio': 1,
    'val': True,
    'degrees': 90,
    'translate': 0.1,
    'scale': 0.5,
    'flipud': 0.5,
    'fliplr': 0.5,
}

# Start again from the pretrained checkpoint rather than the model trained above.
model = YOLO('yolov8x-seg.pt')
results = model.train(**final_train_settings)