In [1]:
# Mount Google Drive inside the Colab VM; the following cells work inside it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%cd drive/MyDrive

/content/drive/MyDrive


In [3]:
!mkdir plantdoc
%cd plantdoc

/content/drive/MyDrive/plantdoc


## 下載檔案

In [4]:
! curl -L "https://public.roboflow.com/ds/yWxXqZujl9?key=4ohhBjR5iC" > plantdoc.zip; unzip -o plantdoc.zip; rm plantdoc.zip

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
 extracting: test/images/20180511_091252-1gy5xf5-e1526048000596_jpg.rf.8cec0a17dc1fc52e2557c03003d8732f.jpg  
 extracting: test/images/peach-leaf-10223898_jpg.rf.8cf85585c72ba2ee682c791d88a311f6.jpg  
 extracting: test/images/strawberry-leaves-stock-picture-948708_jpg.rf.8ba8314b55217861efaaf24580fc3d30.jpg  
 extracting: test/images/02c_jpg.rf.8ccc7cfab5597c03b0c2f83f7f0f1df3.jpg  
 extracting: test/images/backus-056-potato-blight_jpg.rf.8b1c7d3597fbc4c1886d70b5880cb8f5.jpg  
 extracting: test/images/GREEN%20BELL%20PLANT%20YELLOW%20LEAF_JPG_jpg.rf.8e881f6dbafd571a3fd56d572d3f3be1.jpg  
 extracting: test/images/1421_0_jpeg?itok=FMtmgePj_jpg.rf.90c5eeac8b692e14f44e820966da13b1.jpg  
 extracting: test/images/potato-early-blight-alternaria-alternata-lesion-on-a-potato-leaf-a1w1em_jpg.rf.8ee5cc79c23bb4dfd8cd6e4fd489a432.jpg  
 extracting: test/images/dscn6689_jpg.rf.904696395f42c775453f970be794377d.jpg  
 extracting: test/images/1b321015-6e33-4f18-aade-88

## 劃分訓練/驗證集

In [5]:
import json
import math
import os
import random
import shutil

import cv2

In [6]:
# Whether to rebuild <train_data_file>/images and /labels with sequential file names
rename_data_name = True
train_data_file = 'train'

if rename_data_name:
    org_img_path = os.path.join(train_data_file, 'org_images')
    img_path = os.path.join(train_data_file, 'images')
    org_label_path = os.path.join(train_data_file, 'org_labels')
    label_path = os.path.join(train_data_file, 'labels')

    # First run only: keep the downloaded folders as org_* backups.
    if os.path.exists(img_path) and not os.path.exists(org_img_path):
        shutil.move(img_path, org_img_path)
    if os.path.exists(label_path) and not os.path.exists(org_label_path):
        shutil.move(label_path, org_label_path)

    # os.listdir returns names in arbitrary order; sorting makes the
    # data_<n> numbering identical on every run.
    org_img_files = sorted(os.listdir(org_img_path))

    # The output folders must exist before anything is copied into them.
    os.makedirs(img_path, exist_ok=True)
    os.makedirs(label_path, exist_ok=True)

    img_cnt = 0
    for img_file in org_img_files:
        # splitext only touches the real extension, unlike str.replace which
        # would rewrite every '.jpg' occurring anywhere in the name.
        stem, ext = os.path.splitext(img_file)
        ext = ext.lower()
        if ext not in ('.jpg', '.png'):
            # Not an image: report it and move on.
            print(img_file)
            continue

        label_file = stem + '.txt'
        label_src = os.path.join(org_label_path, label_file)
        if not os.path.isfile(label_src):
            print("%s has no label file, skipped" % img_file)
            continue

        with open(label_src, 'r') as f:
            # Whitespace-only label files carry no boxes and are treated as empty.
            has_boxes = f.read().strip() != ''
        if not has_boxes:
            continue

        # Keep the image's own extension so a PNG is not stored under a .jpg name.
        new_img_file = "data_" + str(img_cnt)
        shutil.copy(os.path.join(org_img_path, img_file), os.path.join(img_path, new_img_file + ext))
        shutil.copy(label_src, os.path.join(label_path, new_img_file + '.txt'))
        img_cnt += 1
        

In [7]:
# Fraction of the data that goes to the training split, and the seed that makes
# the shuffle reproducible.
split_rate = 0.8
split_seed = 42

# os.listdir returns names in arbitrary order, so sort first to get a stable
# starting point, then shuffle with a fixed seed so the split is both
# reproducible and independent of file-name order.
total_data = sorted(os.listdir(os.path.join(train_data_file, "images")))
random.Random(split_seed).shuffle(total_data)

split_idx = math.floor(len(total_data) * split_rate)
train_data = total_data[:split_idx]
valid_data = total_data[split_idx:]

print("train data length:", len(train_data))
print("valid data length:", len(valid_data))
print("total data length:", len(total_data))

train data length: 1854
valid data length: 464
total data length: 2318


## 產生 train.txt 和 valid.txt

In [8]:
# Index files for each split: one "./images/<file name>" entry per line.
write_train_txt = os.path.join(train_data_file, 'train.txt')
write_valid_txt = os.path.join(train_data_file, 'valid.txt')

# Plain copies of the split lists; later cells use them for membership checks.
train_data_list = list(train_data)
valid_data_list = list(valid_data)

for split_txt, split_names in ((write_train_txt, train_data_list),
                               (write_valid_txt, valid_data_list)):
    with open(split_txt, 'w') as f:
        f.writelines('%s\n' % os.path.join("./images", name) for name in split_names)

## 產生classes.txt

In [9]:
# Class names; the list position is the class index (see how later cells
# enumerate this list to build category ids).
class_names = [
    'Apple Scab Leaf',
    'Apple leaf',
    'Apple rust leaf',
    'Bell_pepper leaf spot',
    'Bell_pepper leaf',
    'Blueberry leaf',
    'Cherry leaf',
    'Corn Gray leaf spot',
    'Corn leaf blight',
    'Corn rust leaf',
    'Peach leaf',
    'Potato leaf early blight',
    'Potato leaf late blight',
    'Potato leaf',
    'Raspberry leaf',
    'Soyabean leaf',
    'Soybean leaf',
    'Squash Powdery mildew leaf',
    'Strawberry leaf',
    'Tomato Early blight leaf',
    'Tomato Septoria leaf spot',
    'Tomato leaf bacterial spot',
    'Tomato leaf late blight',
    'Tomato leaf mosaic virus',
    'Tomato leaf yellow virus',
    'Tomato leaf',
    'Tomato mold leaf',
    'Tomato two spotted spider mites leaf',
    'grape leaf black rot',
    'grape leaf',
]

In [10]:
# Write the class names to classes.txt, one per line, with every space
# replaced by an underscore. class_names itself is rebound to the
# underscored form, which later cells then see.
classes_txt = os.path.join(train_data_file, 'classes.txt')
class_names = ['_'.join(name.split(' ')) for name in class_names]

with open(classes_txt, 'w') as classes_file:
    classes_file.write('\n'.join(class_names))

## coco json 初始化

In [11]:
super_category = 'leaf'


def _new_coco_dataset():
    """Return a fresh COCO-style skeleton with its own empty lists.

    Every call builds new 'images' and 'annotations' lists. Copying a single
    template with ``dict.update`` would only copy references, so the train and
    validation datasets would append to the very same lists and end up with
    identical content.

    Returns:
        dict with the keys 'info', 'images', 'annotations' and 'categories';
        category ids are 1-based positions in ``class_names``.
    """
    return {
        'info': {'description': '', 'url': '', 'version': '1.0',
                 'year': 2022, 'contributor': 'Joy', 'date_created': ''},
        'images': [],
        'annotations': [],
        'categories': [{'supercategory': super_category, 'id': i+1, 'name': cls}
                       for i, cls in enumerate(class_names)],
    }


# Template kept under its original name; the two splits get independent copies.
dataset_info = _new_coco_dataset()
train_dataset = _new_coco_dataset()
valid_dataset = _new_coco_dataset()

## coco json 內容的 'images', 'annotations'

In [12]:
def yolo2coco(box_info, img_size):
    """Scale a centre/size box to corner coordinates.

    Args:
        box_info: indexable as [x_center, y_center, width, height]; x values
            are multiplied by img_size[1] and y values by img_size[0].
        img_size: indexable as [height, width].

    Returns:
        Tuple (x_min, x_max, y_min, y_max). Note the order: both x values
        come before both y values.
    """
    center_x, center_y = box_info[0], box_info[1]
    half_w = box_info[2]/2
    half_h = box_info[3]/2
    img_h, img_w = img_size[0], img_size[1]

    left = (center_x - half_w) * img_w
    right = (center_x + half_w) * img_w
    top = (center_y - half_h) * img_h
    bottom = (center_y + half_h) * img_h

    return left, right, top, bottom

In [13]:
images_dir = os.path.join(train_data_file, 'images')
labels_dir = os.path.join(train_data_file, 'labels')

# Sets give constant-time membership tests; the lists would be scanned once per image.
train_names = set(train_data_list)
valid_names = set(valid_data_list)

ann_id = 1
# Sorted so image ids are the same on every run (os.listdir order is arbitrary).
for idx, image_file in enumerate(sorted(os.listdir(images_dir))):
    if image_file in train_names:
        dataset = train_dataset
    elif image_file in valid_names:
        dataset = valid_dataset
    else:
        # Without this branch `dataset` would silently keep the previous
        # iteration's value (or be undefined on the first image).
        print("%s is in neither split, skipped" % image_file)
        continue

    image_txt = os.path.splitext(image_file)[0] + '.txt'
    label_txt_path = os.path.join(labels_dir, image_txt)
    if not os.path.exists(label_txt_path):
        # Skip instead of falling through to open(), which would raise.
        print("%s does not exists" % image_txt)
        continue

    image = cv2.imread(os.path.join(images_dir, image_file))
    if image is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        print("%s could not be read, skipped" % image_file)
        continue
    height, width = image.shape[:2]

    dataset['images'].append({'file_name': image_file, 'width': width, 'height': height, 'id': idx+1})

    with open(label_txt_path, 'r') as f:
        label_info = f.readlines()

    for i_label in label_info:
        label = i_label.split()
        if not label:
            # Blank line (e.g. a trailing newline) carries no box.
            continue

        # Label files use 0-based class ids; the categories built from
        # class_names are 1-based.
        class_id = int(label[0])+1
        x = float(label[1])
        y = float(label[2])
        w = float(label[3])
        h = float(label[4])

        box_info = [x, y, w, h]
        img_size = [height, width]
        x_min, x_max, y_min, y_max = yolo2coco(box_info, img_size)
        box_w = max(0, x_max-x_min)
        box_h = max(0, y_max-y_min)

        # bbox is stored as [x_min, y_min, width, height] in pixels.
        dataset['annotations'].append({'segmentation': [], 'area': box_w * box_h, 'iscrowd': 0, 'image_id': idx+1,
                                       'bbox': [x_min, y_min, box_w, box_h], 'category_id': class_id, 'id': ann_id
                                      })
        ann_id += 1
            

## 產生 annotation 資料夾

In [14]:
# Serialise both datasets as JSON into <train_data_file>/annotations,
# creating that folder when it is missing.
annotation_file = os.path.join(train_data_file, 'annotations')
if not os.path.exists(annotation_file):
    os.makedirs(annotation_file)

for json_name, coco_dataset in (('train.json', train_dataset),
                                ('val.json', valid_dataset)):
    with open(os.path.join(annotation_file, json_name), 'w') as json_file:
        json.dump(coco_dataset, json_file)