In [31]:
import os
import json

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [1]:
DATA_SRC = '/content/drive/MyDrive/Colab-Notebooks/data/zalo-traffic-sign/za_traffic_2020/traffic_train'

In [4]:
os.chdir(DATA_SRC)
os.listdir()

['images',
 'sample_images',
 'train_traffic_sign_dataset.json',
 'annotation.csv']

### data folder tree

```
data
└───images
│        123.png
|        123.txt
|        ...
|   classes.names
|   train.txt
|   test.txt
```

### generate YOLO data format from ```annotation.csv```

In [6]:
annotation = pd.read_csv(os.path.join(DATA_SRC, 'annotation.csv'))
display(annotation)

Unnamed: 0,image_id,file_name,height,width,bbox_id,bbox,category_id,supercategory,supercategory_eng
0,3,3.png,626,1622,0,"[880, 333, 19, 18]",2,Cấm dừng và đỗ,No stopping and parking
1,5,5.png,626,1622,2,"[768, 480, 9, 7]",2,Cấm dừng và đỗ,No stopping and parking
2,16,16.png,626,1622,4,"[733, 352, 7, 8]",2,Cấm dừng và đỗ,No stopping and parking
3,17,17.png,626,1622,5,"[1024, 160, 75, 72]",2,Cấm dừng và đỗ,No stopping and parking
4,18,18.png,626,1622,6,"[1138, 295, 47, 41]",2,Cấm dừng và đỗ,No stopping and parking
...,...,...,...,...,...,...,...,...,...
10995,12442,12442.png,626,1622,10951,"[507, 368, 15, 13]",7,Hiệu lệnh,Mandatory
10996,12447,12447.png,626,1622,10966,"[358, 330, 61, 54]",7,Hiệu lệnh,Mandatory
10997,12473,12473.png,626,1622,10976,"[356, 289, 17, 14]",7,Hiệu lệnh,Mandatory
10998,12503,12503.png,626,1622,10991,"[552, 227, 47, 47]",7,Hiệu lệnh,Mandatory


### generate ```<img_id>.txt``` file for each image ```<img_id>.png```

In [19]:
def convert_coco_to_yolo_bbox(coco_bbox):
    '''
    para
        coco_bbox : np.array [x_min, y_min, w, h] (4,)
    return
        yolo_bbox : np.array [x_center, y_center, w, h] (4,)
    '''
    return np.array([
        coco_bbox[0] + coco_bbox[2] / 2.0,
        coco_bbox[1] + coco_bbox[3] / 2.0,
        coco_bbox[2],
        coco_bbox[3]
    ]).astype(np.float64)

In [30]:
for _, row in annotation.iterrows():
    with open(os.path.join(DATA_SRC, 'images', '{}.txt'.format(row['image_id'])), "w") as annot_txt:
        yolo_bbox = convert_coco_to_yolo_bbox(json.loads(row['bbox']))
        annot_txt.write('{} {} {} {} {}'.format(int(row['category_id']) - 1, yolo_bbox[0], yolo_bbox[1], yolo_bbox[2], yolo_bbox[3]))
        print('write to {}'.format('{}.txt'.format(row['image_id'])))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
write to 3277.txt
write to 3284.txt
write to 3348.txt
write to 3348.txt
write to 3364.txt
write to 3373.txt
write to 3383.txt
write to 3396.txt
write to 3414.txt
write to 3421.txt
write to 3426.txt
write to 3428.txt
write to 3433.txt
write to 3480.txt
write to 3487.txt
write to 3524.txt
write to 3558.txt
write to 3559.txt
write to 3570.txt
write to 3581.txt
write to 3590.txt
write to 3591.txt
write to 3613.txt
write to 3639.txt
write to 3641.txt
write to 3664.txt
write to 3684.txt
write to 3689.txt
write to 3704.txt
write to 3709.txt
write to 3716.txt
write to 3728.txt
write to 3741.txt
write to 3758.txt
write to 3762.txt
write to 3765.txt
write to 3775.txt
write to 3800.txt
write to 3827.txt
write to 3827.txt
write to 3857.txt
write to 3870.txt
write to 3881.txt
write to 3892.txt
write to 3905.txt
write to 3922.txt
write to 3933.txt
write to 3935.txt
write to 3970.txt
write to 3970.txt
write to 3975.txt
write to 3980.txt

### train test split with stratified each label and generate ```train.txt``` and ```test.txt```

split

In [51]:
data_train, data_test = train_test_split(annotation, stratify=annotation['category_id'], test_size = 0.25, random_state=69)

print('train split proportion with label')
print(data_train['supercategory'].value_counts() / annotation['supercategory'].value_counts())
print('\ntest split proportion with label')
print(data_test['supercategory'].value_counts() / annotation['supercategory'].value_counts())

train split proportion with label
Nguy hiểm          0.750082
Cấm dừng và đỗ     0.750113
Cấm còn lại        0.749860
Cấm ngược chiều    0.750000
Hiệu lệnh          0.749511
Giới hạn tốc độ    0.750263
Cấm rẽ             0.750000
Name: supercategory, dtype: float64

test split proportion with label
Nguy hiểm          0.249918
Cấm dừng và đỗ     0.249887
Cấm còn lại        0.250140
Cấm ngược chiều    0.250000
Hiệu lệnh          0.250489
Giới hạn tốc độ    0.249737
Cấm rẽ             0.250000
Name: supercategory, dtype: float64


generate ```train.txt``` and ```test.txt```

In [52]:
with open(os.path.join(DATA_SRC, 'train.txt'), "w") as train_txt:
    train_buf = ''
    for _, row in data_train.iterrows():
        train_buf += ('data/images/{}\n'.format(row['file_name']))
    train_txt.write(train_buf[:-1]) # [:-1] remove the last '\n'
    print(train_buf[:-1])

data/images/9399.png
data/images/11419.png
data/images/8033.png
data/images/4432.png
data/images/553.png
data/images/8427.png
data/images/5108.png
data/images/4802.png
data/images/1474.png
data/images/11241.png
data/images/4996.png
data/images/6270.png
data/images/4051.png
data/images/9883.png
data/images/10052.png
data/images/5357.png
data/images/10037.png
data/images/8587.png
data/images/3314.png
data/images/11659.png
data/images/10218.png
data/images/794.png
data/images/5494.png
data/images/12423.png
data/images/10025.png
data/images/8845.png
data/images/4817.png
data/images/3127.png
data/images/3872.png
data/images/3266.png
data/images/12177.png
data/images/9442.png
data/images/8916.png
data/images/7592.png
data/images/10910.png
data/images/1654.png
data/images/9720.png
data/images/7629.png
data/images/5906.png
data/images/9621.png
data/images/10450.png
data/images/10826.png
data/images/7100.png
data/images/6252.png
data/images/3103.png
data/images/11666.png
data/images/3009.png
da

In [53]:
with open(os.path.join(DATA_SRC, 'test.txt'), "w") as test_txt:
    test_buf = ''
    for _, row in data_test.iterrows():
        test_buf += ('data/images/{}\n'.format(row['file_name']))
    test_txt.write(test_buf[:-1]) # [:-1] remove the last '\n'
    print(test_buf[:-1])

data/images/1204.png
data/images/5877.png
data/images/6342.png
data/images/6955.png
data/images/5795.png
data/images/4612.png
data/images/6098.png
data/images/1693.png
data/images/9115.png
data/images/8354.png
data/images/6647.png
data/images/7684.png
data/images/11206.png
data/images/3177.png
data/images/11490.png
data/images/3116.png
data/images/5812.png
data/images/11214.png
data/images/9575.png
data/images/10494.png
data/images/8463.png
data/images/1933.png
data/images/9685.png
data/images/12025.png
data/images/5489.png
data/images/5985.png
data/images/8622.png
data/images/4570.png
data/images/363.png
data/images/6445.png
data/images/9292.png
data/images/4204.png
data/images/1726.png
data/images/1736.png
data/images/4094.png
data/images/1736.png
data/images/4600.png
data/images/6878.png
data/images/5707.png
data/images/8136.png
data/images/10466.png
data/images/3067.png
data/images/5053.png
data/images/5827.png
data/images/1705.png
data/images/12155.png
data/images/10574.png
data/i

### generate ```classes.names```

In [46]:
annotation[['category_id' , 'supercategory_eng']].drop_duplicates().sort_values(by='category_id')

Unnamed: 0,category_id,supercategory_eng
8562,1,No entry
0,2,No stopping and parking
2221,3,No turning
7613,4,Speed limit
5826,5,No others
2777,6,Danger
9978,7,Mandatory


In [47]:
with open(os.path.join(DATA_SRC, 'classes.names'), "w") as classes_names:
    category_df = annotation[['category_id' , 'supercategory_eng']].drop_duplicates().sort_values(by='category_id')
    classes_buf = ''
    for _, row in category_df.iterrows():
        classes_buf += ('{}\n'.format(row['supercategory_eng'].replace(' ', '_')))
    classes_names.write(classes_buf[:-1]) # [:-1] remove the last '\n'
    print(classes_buf[:-1])

No_entry
No_stopping_and_parking
No_turning
Speed_limit
No_others
Danger
Mandatory


### ```zip``` all relevant files

In [54]:
! zip -r data.zip images classes.names train.txt test.txt 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: images/10006.png (deflated 0%)
  adding: images/10029.png (deflated 0%)
  adding: images/10027.png (deflated 0%)
  adding: images/10025.png (deflated 0%)
  adding: images/10024.png (deflated 0%)
  adding: images/10033.png (deflated 0%)
  adding: images/1000.png (deflated 0%)
  adding: images/10014.png (deflated 0%)
  adding: images/1003.png (deflated 0%)
  adding: images/10016.png (deflated 0%)
  adding: images/10030.png (deflated 0%)
  adding: images/10011.png (deflated 0%)
  adding: images/10018.png (deflated 0%)
  adding: images/10008.png (deflated 0%)
  adding: images/10005.png (deflated 0%)
  adding: images/10035.png (deflated 0%)
  adding: images/1006.png (deflated 0%)
  adding: images/10071.png (deflated 0%)
  adding: images/10042.png (deflated 0%)
  adding: images/10040.png (deflated 0%)
  adding: images/10039.png (deflated 0%)
  adding: images/10068.png (deflated 0%)
  adding: images/10057.png (deflated