Generate intermediate CSV files (train and test splits) for the objects we are trying to identify.

In [2]:
from tools import get_data_file
from glob import glob
from pathlib import Path
import pandas as pd
from PIL import Image
import os
from tools.utils import relative2absolute
from sklearn.model_selection import train_test_split

## Load and select classes

In [3]:
# Names of the object classes this notebook keeps; every other class is dropped.
classes_to_keep = {"valentina", "valentina-negra", "botanera"}

In [6]:
# classes.txt lists one class name per line; the zero-based line number is the class id.
with Path('../data/raw/annotations/classes.txt').open() as readable:
    classes = dict(enumerate(line.strip() for line in readable))

# Reverse lookup: class name -> class id.
rev_class = {name: class_id for class_id, name in classes.items()}

In [7]:
# Translate the selected class names into the numeric ids used by the annotation files.
ids_to_keep = {rev_class[name] for name in classes_to_keep}
print(ids_to_keep)

{0, 2, 3}


In [8]:
def txt_to_frame(txt, headers=False):
    """Read a space-separated, header-less annotation file into a DataFrame.

    Parameters
    ----------
    txt : path or file-like object
        Anything accepted by ``pd.read_csv``.
    headers : bool, default False
        When True, the columns are renamed to
        ``['class', 'cx', 'cy', 'width', 'height']``; otherwise they keep the
        default integer labels assigned by pandas.

    Returns
    -------
    pd.DataFrame
        One row per line of the annotation file.
    """
    frame = pd.read_csv(txt, header=None, sep=' ')
    if not headers:
        return frame
    frame.columns = ['class', 'cx', 'cy', 'width', 'height']
    return frame

# Gather every per-image annotation file, skipping only the class-name listing.
# The previous pattern '[!classes]*.txt' was a negated character class: it dropped
# any file whose name starts with one of the letters c, l, a, s or e, not just
# classes.txt. Sorting makes the order independent of the filesystem listing order.
annotation_files = sorted(
    path
    for path in Path('../data/raw/annotations/').glob('*.txt')
    if path.name != 'classes.txt'
)

# Keep only the files that mention at least one of the selected class ids
# (column 0 of each annotation row is the class id).
filtered_annotations = []
for annotation_file in annotation_files:
    annotation = txt_to_frame(annotation_file)
    classes_in_image = set(annotation[0].values)
    if classes_in_image & ids_to_keep:
        filtered_annotations.append(annotation_file)
len(annotation_files), len(filtered_annotations)

(327, 301)

In [10]:
# Fix the seed so the train/test split, and every count derived from it below,
# is identical on each Restart & Run All. The default test share is left unchanged.
train_annotations, test_annotations = train_test_split(filtered_annotations, random_state=42)
print(len(train_annotations), len(test_annotations))

225 76


## Turn annotations into CSV

In [18]:
def annotation_to_frame(annotations):
    """Build one labels DataFrame from a collection of annotation files.

    For every annotation file, the matching image
    (``../data/raw/images/<stem>.jpg``) is opened to obtain its pixel size, and
    each annotated box whose class id is in ``ids_to_keep`` is converted with
    ``relative2absolute`` and emitted as one row.

    Parameters
    ----------
    annotations : iterable of pathlib.Path
        Annotation files readable by ``txt_to_frame``.

    Returns
    -------
    pd.DataFrame
        Columns ``filename, class, width, height, xmin, ymin, xmax, ymax``.
        ``width``/``height`` hold the ``w``/``h`` values returned by
        ``relative2absolute`` for the box.
        NOTE(review): tools that follow the usual xml_to_csv layout may expect
        the *image* size in these two columns - confirm against the consumer.
    """
    columns = ['filename', 'class', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
    records = []
    for annotation_file in annotations:
        image_path = Path('../data/raw/images/', f'{annotation_file.stem}.jpg')
        # Only the dimensions are needed; the context manager closes the file
        # handle immediately instead of leaking one per image.
        with Image.open(image_path) as im:
            img_w, img_h = im.size
        for _, row in txt_to_frame(annotation_file, headers=True).iterrows():
            # iterrows() yields the class id as a float because the row mixes ints
            # and floats; equal-valued ints and floats hash alike, so the set and
            # dict lookups below still match.
            class_id = row['class']
            if class_id not in ids_to_keep:
                continue
            x, y, w, h = relative2absolute(row['cx'], row['cy'], row['width'], row['height'],
                                           img_w=img_w, img_h=img_h)
            records.append([str(image_path), classes[class_id], w, h, x, y, x + w, y + h])
    return pd.DataFrame(records, columns=columns)
                
        
# Convert each split's annotation files into a labels frame (train first, then test).
train_frame, test_frame = (
    annotation_to_frame(split) for split in (train_annotations, test_annotations)
)

In [19]:
# Preview the first rows of the training labels.
train_frame.head(n=5)

Unnamed: 0,filename,class,width,height,xmin,ymin,xmax,ymax
0,../data/raw/images/64218337_432963984193980_72...,valentina,121,250,271,0,392,250
1,../data/raw/images/56985623_136113504180235_25...,valentina,71,210,157,196,228,406
2,../data/raw/images/51228004_2554490001292628_1...,valentina-negra,107,304,191,179,298,483
3,../data/raw/images/11187023_1628683260702660_8...,botanera,108,375,140,22,248,397
4,../data/raw/images/11187023_1628683260702660_8...,valentina,101,273,246,118,347,391


In [20]:
# Preview the first rows of the test labels.
test_frame.head(n=5)

Unnamed: 0,filename,class,width,height,xmin,ymin,xmax,ymax
0,../data/raw/images/58408882_136097377469569_33...,valentina,30,85,217,275,247,360
1,../data/raw/images/12027727_898240143584297_56...,valentina-negra,129,365,307,114,436,479
2,../data/raw/images/11191051_780300458735533_15...,botanera,135,212,150,13,285,225
3,../data/raw/images/60065865_167479797603427_60...,valentina,101,203,105,0,206,203
4,../data/raw/images/996895_493572574051058_3161...,valentina,161,453,166,23,327,476


In [26]:
# Number of labelled boxes per class in the training split.
train_frame.groupby('class').size().to_dict()

{'botanera': 27, 'valentina': 220, 'valentina-negra': 76}

In [27]:
# Number of labelled boxes per class in the test split.
test_frame.groupby('class').size().to_dict()

{'botanera': 6, 'valentina': 70, 'valentina-negra': 25}

In [None]:
# Write both splits to the paths returned by get_data_file('interim', ...).
# The frames are named train_frame / test_frame; the previous `train` / `test`
# names were never defined, so this cell raised NameError.
train_frame.to_csv(get_data_file('interim', 'train.csv'))
test_frame.to_csv(get_data_file('interim', 'test.csv'))

Now it is possible to use [Swaini's script](https://raw.githubusercontent.com/Swaini/object_detection_retraining/master/generate_tfrecord.py), located in `src/external/generate_tfrecord.py`, to turn each CSV file into a TFRecord file. Usage, from the root of this repo:

**Do not forget to edit the file `generate_tfrecord.py` to add all your classes**

```
PYTHONPATH=src python src/external/generate_tfrecord.py --input_csv=data/interim/train.csv  --output_tfrecord=data/interim/train.record

PYTHONPATH=src python src/external/generate_tfrecord.py --input_csv=data/interim/test.csv  --output_tfrecord=data/interim/test.record
```