In [1]:
import pandas as pd
import os
from tqdm import *
import struct
import bson
from utilities import utils

In [2]:
data_dir = "inputs/"

#train_bson_path = os.path.join(data_dir, "train.bson")
#num_train_products = 7069896

train_bson_path = os.path.join(data_dir, "train_example.bson")
num_train_products = 82

#test_bson_path = os.path.join(data_dir, "test.bson")
#num_test_products = 1768172

## Create lookup tables
The generator uses several lookup tables that describe the layout of the BSON file, which products and images are part of the training/validation sets, and so on.You only need to generate these tables once, as they get saved to CSV files.

### Lookup table for categories¶

In [3]:
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path, index_col="category_id")

# Maps the category_id to an integer index. This is what we'll use to
# one-hot encode the labels.
categories_df["category_idx"] = pd.Series(range(len(categories_df)), index=categories_df.index)

categories_df.to_csv(os.path.join(data_dir, "categories.csv"))
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


## Read the BSON files
We store the offsets and lengths of all items, allowing us random access to the items later. <br />
Inspired by code from: https://www.kaggle.com/vfdev5/random-item-access

Note: this takes a few minutes to execute, but we only have to do it once (we'll save the table to a CSV file afterwards).

In [4]:
def read_bson(bson_path, num_records, with_categories):
    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        while True:
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                break

            length = struct.unpack("<i", item_length_bytes)[0]

            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length

            item = bson.BSON(item_data).decode()
            product_id = item["_id"]
            num_imgs = len(item["imgs"])

            row = [num_imgs, offset, length]
            if with_categories:
                row += [item["category_id"]]
            rows[product_id] = row

            offset += length
            f.seek(offset)
            pbar.update()

    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns += ["category_id"]

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.name = "product_id"
    df.columns = columns
    df.sort_index(inplace=True)
    
    return df

In [5]:
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|██████████| 82/82 [00:00<00:00, 59804.02it/s]

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.23 ms





In [6]:
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [7]:
train_offsets_df.to_csv(os.path.join(data_dir, "train_offsets.csv"))

In [8]:
# How many products? 7069896
len(train_offsets_df)

82

In [9]:
# How many categories? 5270
len(train_offsets_df["category_id"].unique())

36

In [10]:
# How many images in total? 12371293
train_offsets_df["num_imgs"].sum()

36

#### Also create a table for the offsets from the test set.

In [8]:
%time test_offsets_df = read_bson(test_bson_path, num_records=num_test_products, with_categories=False)

NameError: name 'test_bson_path' is not defined

In [None]:
test_offsets_df.head()

In [None]:
test_offsets_df.to_csv(os.path.join(data_dir, "test_offsets.csv"))

### Lookup table for test set images
Create a list containing a row for each image. If a product has more than one image, it appears more than once in this list.

In [None]:
def make_test_set(df):
    test_list = []
    for ir in tqdm(df.itertuples()):
        product_id = ir[0]
        num_imgs = ir[1]
        for img_idx in range(num_imgs):
            test_list.append([product_id, img_idx])

    columns = ["product_id", "img_idx"]
    test_df = pd.DataFrame(test_list, columns=columns)
    return test_df

In [None]:
test_images_df = make_test_set(test_offsets_df)

In [None]:
test_images_df.head()

In [None]:
print("Number of test images:", len(test_images_df))

In [None]:
test_images_df.to_csv(os.path.join(data_dir, "test_images.csv"))

## Create a random train/validation split
We split on products, not on individual images. Since some of the categories only have a few products, we do the split separately for each category.

This creates two new tables, one for the training images and one for the validation images. There is a row for every single image, so if a product has more than one image it occurs more than once in the table.

In [None]:
utils.set_results_reproducible()

#### Create dictionaries for quick lookup of category_id to category_idx mapping.

In [11]:
train_images_df, val_images_df = utils.make_val_set(categories_df, train_offsets_df, 
                                                    split_percentage=0.2, drop_percentage=0)

82it [00:00, 356407.18it/s]
100%|██████████| 82/82 [00:00<00:00, 8071.65it/s]


In [12]:
train_images_df.head()

Unnamed: 0,product_id,category_idx,img_idx
0,12,3964,0
1,76,878,0
2,76,878,1
3,53,1700,0
4,72,2520,0


In [13]:
val_images_df.head()

Unnamed: 0,product_id,category_idx,img_idx
0,11,5055,0
1,25,5055,0
2,25,5055,1
3,25,5055,2
4,25,5055,3


In [14]:
print("Number of training images:", len(train_images_df))
print("Number of validation images:", len(val_images_df))
print("Total images:", len(train_images_df) + len(val_images_df))

('Number of training images:', 100)
('Number of validation images:', 10)
('Total images:', 110)


In [15]:
train_images_df.to_csv(os.path.join(data_dir, "train_images.csv"))
val_images_df.to_csv(os.path.join(data_dir, "val_images.csv"))