In [1]:
import pandas as pd
import tarfile
import os
import shutil

Extract the tar archive containing the data

In [2]:
fname = "DLCV_logo_project.tar.gz"
destination_dir = "./"

# Open the archive in a context manager so the file handle is always released,
# even if extraction fails part-way through.
# NOTE(review): extractall trusts the member paths stored in the archive; only
# run this cell on an archive that comes from a trusted source.
with tarfile.open(fname) as tar:
    tar.extractall(destination_dir)

Import annotations

In [3]:
seed = 12

# Load the training annotations and shuffle every row in one pass; the fixed
# seed keeps the row order reproducible between runs.
annotations_path = "./DLCV_logo_project/annot_train.csv"
df = pd.read_csv(annotations_path).sample(frac=1, random_state=seed)

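Copy the annotations into a working dataframe (note: no filtering is applied here, so all logo classes are kept, not only the 5 compulsory logos)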

In [4]:
# Work on an independent copy so later edits never touch the shuffled `df`.
comp_df = df.copy(deep=True)
# Preview the first five rows.
comp_df.head(5)

Unnamed: 0,photo_filename,width,height,class,xmin,ymin,xmax,ymax
41717,houston_1715220490095162382_20180215.jpg,1080,1080,Chanel,602,144,975,399
36371,phoenix_1798704855302061580_20180610.jpg,1080,720,Mercedes-Benz,510,379,568,439
21356,la_1229762905742294064_20160416.jpg,1080,1349,Under Armour,647,784,755,930
35719,miami_1267757845559437345_20160608.jpg,1080,809,Mercedes-Benz,903,379,971,448
44000,houston_1453054272111782394_20170218.jpg,480,480,Intimissimi,27,29,314,329


Remove space and special characters from logo names

In [5]:
def _keep_alnum(name):
    """Return `name` with every non-alphanumeric character removed."""
    return "".join(filter(str.isalnum, name))


# Normalise the class labels (e.g. "Mercedes-Benz" -> "MercedesBenz").
comp_df["class"] = comp_df["class"].apply(_keep_alnum)
comp_df

Unnamed: 0,photo_filename,width,height,class,xmin,ymin,xmax,ymax
41717,houston_1715220490095162382_20180215.jpg,1080,1080,Chanel,602,144,975,399
36371,phoenix_1798704855302061580_20180610.jpg,1080,720,MercedesBenz,510,379,568,439
21356,la_1229762905742294064_20160416.jpg,1080,1349,UnderArmour,647,784,755,930
35719,miami_1267757845559437345_20160608.jpg,1080,809,MercedesBenz,903,379,971,448
44000,houston_1453054272111782394_20170218.jpg,480,480,Intimissimi,27,29,314,329
...,...,...,...,...,...,...,...,...
36482,sf_1488603129770522889_20170408.jpg,1080,781,MercedesBenz,350,307,482,459
40177,chicago_1277616105368141391_20160621.jpg,1080,718,CocaCola,101,251,176,295
19709,sf_1347260241088062451_20160925.jpg,1080,1080,Adidas,360,90,436,152
38555,phoenix_1804075524908051981_20180618.jpg,1080,1350,NFL,655,779,728,867


Store the new dataframe in a CSV file. A separate file is needed because the spaces and special characters have been removed from the class names.

Create the folder that stores the data

In [6]:
path = "./data"

# exist_ok=True keeps the cell safe to re-run, while genuine failures (for
# example a permission error) surface instead of being silently swallowed.
os.makedirs(path, exist_ok=True)

In [13]:
# Persist the cleaned annotations; the dataframe index is not written out.
cleaned_csv_path = "./data/annotation_train_without_spaces_v1.csv"
comp_df.to_csv(cleaned_csv_path, index=False)

Create a folder containing the training images listed in `comp_df` (note: `comp_df` is currently an unfiltered copy of all annotations, not only the compulsory logos)

In [7]:
dest_folder = "./data/train"
source_folder = "./DLCV_logo_project/train/"

# exist_ok=True keeps the cell re-runnable; other errors (e.g. permissions)
# are raised instead of being hidden.
os.makedirs(dest_folder, exist_ok=True)

# `photo_filename` can repeat across annotation rows, so deduplicate to copy
# each image only once. Missing names are dropped because they cannot be copied.
filenames = comp_df["photo_filename"].dropna().unique()

# Copying stays best-effort (one bad file does not abort the loop), but every
# failure is recorded and summarised rather than silently ignored.
failed_copies = []
for f in filenames:
    try:
        shutil.copy(os.path.join(source_folder, f), dest_folder)
    except OSError as err:
        failed_copies.append((f, err))

if failed_copies:
    first_name, first_err = failed_copies[0]
    print(
        f"Could not copy {len(failed_copies)} of {len(filenames)} files "
        f"(first failure: {first_name}: {first_err})"
    )

In [8]:
# Count every entry currently present in the training folder.
n_files = sum(1 for _ in os.scandir(dest_folder))
print(f"Train folder contains {n_files} images.")

Train folder contains 38913 images.


Split training images into 5 subfolders to perform annotations on Roboflow

In [9]:
# Create dictionary containing subfolder names as keys and a list of files as value
subfolders = ["Reb", "Elio", "Eline", "AleL", "AleC"]

# Split the *unique* image filenames rather than the annotation rows:
# `photo_filename` can repeat across rows, so sampling rows produces shares
# with duplicate names and uneven numbers of images.
# NOTE(review): this assigns images to subfolders differently from the earlier
# row-based split; do not re-run if annotation work was already distributed.
shuffled_files = (
    comp_df["photo_filename"]
    .drop_duplicates()
    .sample(frac=1, random_state=seed)
    .to_numpy()
)

# Equal share per subfolder, derived from the number of subfolders instead of
# a hard-coded 0.2.
subfolder_size = len(shuffled_files) // len(subfolders)

subfolders_dict = {}
for position, s in enumerate(subfolders[:-1]):
    start = position * subfolder_size
    subfolders_dict[s] = shuffled_files[start:start + subfolder_size]

# The last subfolder takes everything that is left, so no image is lost to the
# integer division above.
subfolders_dict[subfolders[-1]] = shuffled_files[(len(subfolders) - 1) * subfolder_size:]

In [10]:
# Create subfolders directory.
# makedirs also creates the parent "./data" when it is missing, and
# exist_ok=True keeps the cell re-runnable without hiding real errors.
os.makedirs("./data/subfolders", exist_ok=True)

In [11]:
# For each key in the subfolders dictionary
# Dedicated names are used here so this cell does not overwrite the
# `source_folder`, `dest_folder` and `filenames` variables set by earlier
# cells (re-running those cells afterwards would otherwise see stale values).
train_folder = "./data/train/"
subfolders_root = "./data/subfolders"

# Copying stays best-effort, but failures are collected and reported instead
# of being silently discarded.
failed_copies = []
for folder_name, subfolder_files in subfolders_dict.items():
    # 1. Create subfolder corresponding to the key
    subfolder_path = os.path.join(subfolders_root, folder_name)
    os.makedirs(subfolder_path, exist_ok=True)
    # 2. Copy files contained in the dictionary value corresponding to that key to the subfolder
    for f in subfolder_files:
        try:
            shutil.copy(os.path.join(train_folder, f), subfolder_path)
        except OSError as err:
            failed_copies.append((folder_name, f, err))

if failed_copies:
    first_folder, first_name, first_err = failed_copies[0]
    print(
        f"Could not copy {len(failed_copies)} files "
        f"(first failure: {first_folder}/{first_name}: {first_err})"
    )


Zip the subfolders directory

In [12]:
# Archiving is currently disabled: uncomment the line below to (re)create
# ./subfolders.zip from the contents of ./data/subfolders.
#shutil.make_archive("./subfolders", "zip", "./data/subfolders")

'c:\\Users\\rebec\\Documents\\GitHub\\ObjectRecognition\\LogoDetection_DSBAProject\\training_process\\autocomposer_for_training\\subfolders.zip'