In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Cleansing

In [3]:
import pandas as pd

annotations_file_path = '/content/drive/Shareddrives/Deep Learning/Deep Learning CSV/General Table Dataset/train_annotated.csv'
folds_file_path = '/content/drive/Shareddrives/Deep Learning/Deep Learning CSV/General Table Dataset/train_folds.csv'

# Pull information from annotations and folds csv files
annotations = pd.read_csv(annotations_file_path)
folds = pd.read_csv(folds_file_path)

print(annotations.head())
print(folds.head())

   image_id                    bbox  \
0  0101_003  [769, 945, 1301, 2028]   
1  0110_099  [269, 1652, 2022, 980]   
2  0113_013                     NaN   
3  0140_007  [698, 1781, 1083, 290]   
4  0146_281  [703, 431, 1041, 1121]   

                                       segmentation       area  height  \
0    [[769, 945, 769, 2973, 2070, 2973, 2070, 945]]  2638428.0  3300.0   
1  [[269, 1652, 269, 2632, 2291, 2632, 2291, 1652]]  1981560.0  3300.0   
2                                               NaN        NaN     NaN   
3  [[698, 1781, 698, 2071, 1781, 2071, 1781, 1781]]   314070.0  3300.0   
4    [[703, 431, 703, 1552, 1744, 1552, 1744, 431]]  1166961.0  3300.0   

    width  bbox_xmin  bbox_ymin  bbox_width  bbox_height  
0  2544.0      769.0      945.0      1301.0       2028.0  
1  2544.0      269.0     1652.0      2022.0        980.0  
2     NaN        NaN        NaN         NaN          NaN  
3  2560.0      698.0     1781.0      1083.0        290.0  
4  2544.0      703.0     

In [4]:
# filtering for images with a single datatables
folds_filtered = folds[folds["bbox_count"] == 1]
print(folds_filtered.head())

   image_id  bbox_count  source  fold
0  0101_003           1  marmot     3
1  0110_099           1  marmot     0
2  0140_007           1  marmot     4
7  0148_271           1  marmot     1
8  0148_479           1  marmot     2


In [5]:
# continuing filtering for single datatables using annotated dataset
# only need annotations_filtered now, don't need to use folds
valid_image_ids = set(folds_filtered["image_id"])

annotations_filtered = annotations[annotations["image_id"].isin(valid_image_ids)].copy()

print(annotations_filtered.head())
print(annotations_filtered.shape)

    image_id                    bbox  \
0   0101_003  [769, 945, 1301, 2028]   
1   0110_099  [269, 1652, 2022, 980]   
3   0140_007  [698, 1781, 1083, 290]   
12  0148_271  [389, 383, 1766, 1032]   
13  0148_479  [932, 425, 1198, 1081]   

                                        segmentation       area  height  \
0     [[769, 945, 769, 2973, 2070, 2973, 2070, 945]]  2638428.0  3300.0   
1   [[269, 1652, 269, 2632, 2291, 2632, 2291, 1652]]  1981560.0  3300.0   
3   [[698, 1781, 698, 2071, 1781, 2071, 1781, 1781]]   314070.0  3300.0   
12    [[389, 383, 389, 1415, 2155, 1415, 2155, 383]]  1822512.0  3300.0   
13    [[932, 425, 932, 1506, 2130, 1506, 2130, 425]]  1295038.0  3300.0   

     width  bbox_xmin  bbox_ymin  bbox_width  bbox_height  
0   2544.0      769.0      945.0      1301.0       2028.0  
1   2544.0      269.0     1652.0      2022.0        980.0  
3   2560.0      698.0     1781.0      1083.0        290.0  
12  2544.0      389.0      383.0      1766.0       1032.0  
13  2560

In [6]:
# Rescale bbox values to be in terms of the image width and height instead of pixel value
# This will lead to less issues when resizing images during the data loading

scaled_bboxes = []
for index, row in annotations_filtered.iterrows():
  x_min = row['bbox_xmin']
  y_min = row['bbox_ymin']
  bbox_width = row['bbox_width']
  bbox_height = row['bbox_height']

  im_width= row["width"]
  im_height = row["height"]

  # All bbox values are now from 0 to 1
  bbox = [x_min/im_width, y_min/im_height, bbox_width/im_width, y_min/im_height]

  scaled_bboxes.append(bbox)

In [7]:
# Declare X and Y variables
X = annotations_filtered["image_id"].to_list()
Y = scaled_bboxes

# Check we have the same number of inputs and labels
print("Number of samples in X: ", len(X))
print("Number of samples in Y: ", len(Y))

Number of samples in X:  1308
Number of samples in Y:  1308


## Data Processing
Create 70/15/15 train/val/test splits

In [8]:
train_size = int(1308*0.7)
val_size = int(1308*0.9)
test_size = int(1308*1)

X_train, Y_train = X[:train_size], Y[:train_size]
X_val, Y_val = X[train_size:val_size], Y[train_size:val_size]
X_test, Y_test = X[val_size:], Y[val_size:]

print("Number of samples in train set: ", len(X_train))
print("Number of samples in val set: ", len(X_val))
print("Number of samples in test set: ", len(X_test))

Number of samples in train set:  915
Number of samples in val set:  262
Number of samples in test set:  131


In [None]:
# TODO: create visualizations to see if distribution of placement and col/row numbers are even among splits