<a href="https://colab.research.google.com/github/diane-park/TableSnap/blob/main/Table_Detection_Milestone_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Dataset

The dataset we are using is the [General Table Detection Dataset](https://www.kaggle.com/datasets/rhtsingh/general-table-recognition-dataset/data?select=train.csv), which contains scanned document images together with table annotations (bounding-box coordinates for each table region). The dataset is publicly available on Kaggle.

## Accessing the Dataset and Installing Dependencies

In [1]:
# Create a requirements txt
requirements = """
kagglehub
pandas==1.5.3
torch==2.7.0+cu126
opencv-python==4.5.3.56
matplotlib==3.5.1
git+https://github.com/ultralytics/yolov5.git
"""

with open('requirements.txt', 'w') as f:
    f.write(requirements)

# Install dependencies
!pip install -U -r requirements.txt

Collecting git+https://github.com/ultralytics/yolov5.git (from -r requirements.txt (line 7))
  Cloning https://github.com/ultralytics/yolov5.git to /tmp/pip-req-build-jmd1nx7z
  Running command git clone --filter=blob:none --quiet https://github.com/ultralytics/yolov5.git /tmp/pip-req-build-jmd1nx7z
  Resolved https://github.com/ultralytics/yolov5.git to commit fe1d4d9947735473006c68513168fef093ff17ce
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1

In [2]:
import kagglehub
import os
import shutil

In [3]:
# Download latest version of the Kaggle dataset through kagglehub.
# `path` is the local folder of the download; later cells build every
# CSV and image location from it.
path = kagglehub.dataset_download("rhtsingh/general-table-recognition-dataset")

print("Path to dataset files:", path)
# Last expression of the cell: show the top-level entries of the dataset folder
os.listdir(path)

Path to dataset files: /kaggle/input/general-table-recognition-dataset


['train_folds.csv',
 'Orig_Annotations-20220104T075522Z-001',
 'create_folds.py',
 'Orig_Image-20220104T074306Z-001',
 'train_annotated.csv',
 'General Dataset-20220104T073035Z-001',
 'train.csv']

## Data Cleansing

In [4]:
import pandas as pd

# Locations of the two CSV files used below, relative to the dataset folder
annotations_file_path, folds_file_path = (
    path + '/train_annotated.csv',
    path + '/train_folds.csv',
)

# Load the bounding-box annotations and the per-image fold table
annotations, folds = (
    pd.read_csv(csv_file)
    for csv_file in (annotations_file_path, folds_file_path)
)

# Preview the first rows of each table
for frame in (annotations, folds):
    print(frame.head())

   image_id                    bbox  \
0  0101_003  [769, 945, 1301, 2028]   
1  0110_099  [269, 1652, 2022, 980]   
2  0113_013                     NaN   
3  0140_007  [698, 1781, 1083, 290]   
4  0146_281  [703, 431, 1041, 1121]   

                                       segmentation       area  height  \
0    [[769, 945, 769, 2973, 2070, 2973, 2070, 945]]  2638428.0  3300.0   
1  [[269, 1652, 269, 2632, 2291, 2632, 2291, 1652]]  1981560.0  3300.0   
2                                               NaN        NaN     NaN   
3  [[698, 1781, 698, 2071, 1781, 2071, 1781, 1781]]   314070.0  3300.0   
4    [[703, 431, 703, 1552, 1744, 1552, 1744, 431]]  1166961.0  3300.0   

    width  bbox_xmin  bbox_ymin  bbox_width  bbox_height  
0  2544.0      769.0      945.0      1301.0       2028.0  
1  2544.0      269.0     1652.0      2022.0        980.0  
2     NaN        NaN        NaN         NaN          NaN  
3  2560.0      698.0     1781.0      1083.0        290.0  
4  2544.0      703.0     

In [5]:
# Keep only the images whose bbox_count is exactly 1 (a single table)
has_single_table = folds["bbox_count"] == 1
folds_filtered = folds[has_single_table]
print(folds_filtered.head())

   image_id  bbox_count  source  fold
0  0101_003           1  marmot     3
1  0110_099           1  marmot     0
2  0140_007           1  marmot     4
7  0148_271           1  marmot     1
8  0148_479           1  marmot     2


In [6]:
# Carry the single-table filter over to the annotation rows.
# From here on only annotations_filtered is needed; folds is not used again.
valid_image_ids = set(folds_filtered["image_id"])

is_single_table_image = annotations["image_id"].isin(valid_image_ids)
annotations_filtered = annotations.loc[is_single_table_image].copy()

# Preview the result and report its (rows, columns) shape
for summary in (annotations_filtered.head(), annotations_filtered.shape):
    print(summary)

    image_id                    bbox  \
0   0101_003  [769, 945, 1301, 2028]   
1   0110_099  [269, 1652, 2022, 980]   
3   0140_007  [698, 1781, 1083, 290]   
12  0148_271  [389, 383, 1766, 1032]   
13  0148_479  [932, 425, 1198, 1081]   

                                        segmentation       area  height  \
0     [[769, 945, 769, 2973, 2070, 2973, 2070, 945]]  2638428.0  3300.0   
1   [[269, 1652, 269, 2632, 2291, 2632, 2291, 1652]]  1981560.0  3300.0   
3   [[698, 1781, 698, 2071, 1781, 2071, 1781, 1781]]   314070.0  3300.0   
12    [[389, 383, 389, 1415, 2155, 1415, 2155, 383]]  1822512.0  3300.0   
13    [[932, 425, 932, 1506, 2130, 1506, 2130, 425]]  1295038.0  3300.0   

     width  bbox_xmin  bbox_ymin  bbox_width  bbox_height  
0   2544.0      769.0      945.0      1301.0       2028.0  
1   2544.0      269.0     1652.0      2022.0        980.0  
3   2560.0      698.0     1781.0      1083.0        290.0  
12  2544.0      389.0      383.0      1766.0       1032.0  
13  2560

In [7]:
# Rescale bbox values to be in terms of the image width and height instead of pixel value
# This will lead to less issues when resizing images during the data loading
#
# The division is done on whole columns at once instead of looping over
# DataFrame.iterrows(), which is slow and boxes every row into an object Series.

# Pixel-space boxes, one row per image: [x_min, y_min, bbox_width, bbox_height]
pixel_bboxes = annotations_filtered[
    ["bbox_xmin", "bbox_ymin", "bbox_width", "bbox_height"]
].to_numpy(dtype=float)

# Matching divisors: x values are divided by the image width, y values by the image height
image_dims = annotations_filtered[
    ["width", "height", "width", "height"]
].to_numpy(dtype=float)

# List of [x_min, y_min, width, height] lists (plain Python floats), each value
# expressed as a fraction of the image size
scaled_bboxes = (pixel_bboxes / image_dims).tolist()

In [8]:
# Inputs are the image ids, labels are the rescaled bounding boxes
Y = scaled_bboxes
X = annotations_filtered["image_id"].tolist()

# Inputs and labels must line up one-to-one, so report both counts
n_inputs, n_labels = len(X), len(Y)
print("Number of samples in X: ", n_inputs)
print("Number of samples in Y: ", n_labels)

Number of samples in X:  1308
Number of samples in Y:  1308


## Data Processing
Create 70/20/10 train/val/test splits

In [9]:
# Cumulative boundaries of the 70/20/10 train/val/test split.
# They are derived from len(X) rather than a hard-coded sample count, so the
# split stays correct if the filtering above yields a different number of rows.
assert len(X) == len(Y), "X and Y must contain the same number of samples"
n_samples = len(X)

train_size = int(n_samples * 0.7)  # index where the train slice ends
val_size = int(n_samples * 0.9)    # index where the val slice ends (cumulative, not a count)
test_size = n_samples              # index where the test slice ends

# NOTE(review): the split is sequential in DataFrame order, without shuffling.
# If rows are grouped by source or image id, the splits may not be
# identically distributed - confirm this is intended.
X_train, Y_train = X[:train_size], Y[:train_size]
X_val, Y_val = X[train_size:val_size], Y[train_size:val_size]
X_test, Y_test = X[val_size:], Y[val_size:]

# Every sample must land in exactly one split
assert len(X_train) + len(X_val) + len(X_test) == n_samples

print("Number of samples in train set: ", len(X_train))
print("Number of samples in val set: ", len(X_val))
print("Number of samples in test set: ", len(X_test))

Number of samples in train set:  915
Number of samples in val set:  262
Number of samples in test set:  131


In [10]:
# YOLO-style layout: parallel images/ and labels/ trees with one sub-folder per split
target_train_image_dir, target_train_label_dir = 'dataset/images/train', 'dataset/labels/train'
target_val_image_dir, target_val_label_dir = 'dataset/images/val', 'dataset/labels/val'
target_test_image_dir, target_test_label_dir = 'dataset/images/test', 'dataset/labels/test'


# make sure every output folder exists before anything is written into it
for split_dir in (
    target_train_image_dir,
    target_train_label_dir,
    target_val_image_dir,
    target_val_label_dir,
    target_test_image_dir,
    target_test_label_dir,
):
    os.makedirs(split_dir, exist_ok=True)

In [11]:
def process_data(ids, bboxes, target_image_dir, target_label_dir, source_image_dir=None):
    """Copy images into a YOLO dataset folder and write one label file per image.

    Args:
        ids: image ids, i.e. image file names without their extension.
        bboxes: one ``[x_min, y_min, width, height]`` box per id, already
            divided by the image width/height.
        target_image_dir: folder the image files are copied into.
        target_label_dir: folder the ``<id>.txt`` label files are written to.
        source_image_dir: folder holding the original images. When omitted,
            the ``Orig_Image`` folder inside the downloaded dataset (module
            level ``path``) is used, as before.

    Raises:
        FileNotFoundError: if an id has neither a ``.png`` nor a ``.jpg`` file
            in ``source_image_dir``.
    """
    if source_image_dir is None:
        # Resolved at call time so the function keeps following the notebook's `path`
        source_image_dir = path + '/Orig_Image-20220104T074306Z-001/Orig_Image'

    # Harmless if the folders already exist; lets the function be used on its own
    os.makedirs(target_image_dir, exist_ok=True)
    os.makedirs(target_label_dir, exist_ok=True)

    for image_id, bbox in zip(ids, bboxes):
        # Copy whichever image file exists, preferring PNG over JPG
        for extension in ('.png', '.jpg'):
            source_image = os.path.join(source_image_dir, image_id + extension)
            if os.path.exists(source_image):
                shutil.copy(source_image, os.path.join(target_image_dir, image_id + extension))
                break
        else:
            raise FileNotFoundError(
                f"No .png or .jpg image found for id '{image_id}' in {source_image_dir}"
            )

        # Convert the top-left based box to YOLO format (x_center, y_center, width, height)
        x_min, y_min, width, height = bbox[0], bbox[1], bbox[2], bbox[3]
        x_center = x_min + width / 2
        y_center = y_min + height / 2

        # One line per object; class index 0 is the only class in this dataset
        label_path = os.path.join(target_label_dir, image_id + '.txt')
        with open(label_path, "w") as file:
            file.write(f"0 {x_center} {y_center} {width} {height}")

# Write every split to disk, in the order test -> validation -> train
for split_ids, split_bboxes, split_image_dir, split_label_dir in (
    (X_test, Y_test, target_test_image_dir, target_test_label_dir),      # TEST
    (X_val, Y_val, target_val_image_dir, target_val_label_dir),          # VALIDATION
    (X_train, Y_train, target_train_image_dir, target_train_label_dir),  # TRAIN
):
    process_data(split_ids, split_bboxes, split_image_dir, split_label_dir)

In [12]:
# TODO: create visualizations to see if distribution of placement and col/row numbers are even among splits

## Baseline Model

We will use YOLOv5 as our baseline model for object detection. We chose YOLO as a baseline because it resizes and pads input images automatically, so the dataset can contain images of different sizes.

In [13]:
# Clone YOLOv5 repository
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5

# Install dependencies
!pip install -U -r requirements.txt

import torch
print(torch.cuda.is_available())  # Should print True if GPU is available

fatal: destination path 'yolov5' already exists and is not an empty directory.
/content/yolov5
Collecting numpy>=1.23.5 (from -r requirements.txt (line 7))
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
True


In [14]:
# Create data.yaml file

# Location the dataset description is saved to
data_yaml_path = '/content/data.yaml'

# Dataset description: one image folder per split and a single class
data_yaml_content = """
train: /content/dataset/images/train
val: /content/dataset/images/val
test: /content/dataset/images/test

nc: 1
names: ['datatables']
"""

# Persist the description so it can be passed to the training script via --data
with open(data_yaml_path, 'w') as yaml_file:
    yaml_file.write(data_yaml_content)

print(f"data.yaml file created at {data_yaml_path}")

data.yaml file created at /content/data.yaml


You will have the option to enter your W&B account in order to visualize results. Please input your account info, if possible.

Please use a GPU to accelerate the training. In Colab, you can enable this in **Runtime** -> **Change Runtime Type** -> Under **Hardware accelerator**, select **GPU** -> Save

In [15]:
# train from the YOLOv5 repo (train.py is resolved relative to the current
# working directory, so this cell must run from inside the cloned repo).
# Settings: 640 px images, batch size 16, 10 epochs, the yolov5s model config,
# starting from the yolov5s.pt weights. The run is named "yolo_model"; the
# evaluation command below reads its checkpoint from runs/train/yolo_model/.
!python train.py --img 640 --batch 16 --epochs 10 --data /content/data.yaml --cfg models/yolov5s.yaml --weights yolov5s.pt --name yolo_model

# evaluate the model using the best checkpoint of the run above
# NOTE(review): no --task is given, so this presumably scores the `val` split
# of data.yaml rather than the held-out test split - confirm.
!python val.py --weights runs/train/yolo_model/weights/best.pt --data /content/data.yaml --img 640

2025-04-27 17:54:28.557274: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745776468.796374    3292 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745776468.860175    3292 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: (30 second timeout) 
[34m[1mwandb[0m: W&B disabled due to login timeout.
[34m[1mtrain: [0mweights=yolov5s.pt, cfg=models

In [16]:
# Examine loss, precision, and recall during training
# (prints the per-epoch results.csv written by the "yolo_model" training run)
!cat runs/train/yolo_model/results.csv

               epoch,      train/box_loss,      train/obj_loss,      train/cls_loss,   metrics/precision,      metrics/recall,     metrics/mAP_0.5,metrics/mAP_0.5:0.95,        val/box_loss,        val/obj_loss,        val/cls_loss,               x/lr0,               x/lr1,               x/lr2
                   0,             0.10054,            0.028101,                   0,           0.0026081,             0.78244,            0.011335,           0.0026905,            0.040288,            0.014197,                   0,            0.070517,           0.0032759,           0.0032759
                   1,            0.064339,            0.023571,                   0,             0.44087,             0.47328,             0.41449,             0.17798,            0.028088,            0.012817,                   0,            0.039863,           0.0059549,           0.0059549
                   2,            0.055182,            0.019024,                   0,             0.25402,             

## Deep Learning Model


In [19]:
# go to the YOLOv5 repo
%cd yolov5

# fine-tune from our baseline model with more epochs, and a lower learning rate
!python train.py --img 640 --batch 16 --epochs 50 --data /content/data.yaml --cfg models/yolov5s.yaml --weights runs/train/yolo_model/weights/best.pt --name fine_tuned_model --hyp data/hyps/hyp.scratch-low.yaml
!python val.py --weights runs/train/fine_tuned_model/weights/best.pt --data /content/data.yaml --img 640


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
       8/49      4.19G    0.02485   0.008827          0          2        640: 100% 58/58 [01:06<00:00,  1.15s/it]
                 Class     Images  Instances          P          R      mAP50   mAP50-95: 100% 9/9 [00:05<00:00,  1.60it/s]
                   all        262        262      0.895      0.878      0.912      0.671

      Epoch    GPU_mem   box_loss   obj_loss   cls_loss  Instances       Size
  with torch.cuda.amp.autocast(amp):
  with tor

In [20]:
# Examine loss, precision, and recall during training
# (prints the per-epoch results.csv written by the "fine_tuned_model" training run)
!cat runs/train/fine_tuned_model/results.csv

               epoch,      train/box_loss,      train/obj_loss,      train/cls_loss,   metrics/precision,      metrics/recall,     metrics/mAP_0.5,metrics/mAP_0.5:0.95,        val/box_loss,        val/obj_loss,        val/cls_loss,               x/lr0,               x/lr1,               x/lr2
                   0,            0.021077,           0.0096947,                   0,             0.90961,             0.88339,             0.91731,             0.74744,           0.0092811,           0.0046054,                   0,            0.070517,           0.0032759,           0.0032759
                   1,             0.02432,            0.009765,                   0,             0.90189,             0.90076,             0.94095,             0.70295,            0.011155,           0.0043583,                   0,            0.040386,           0.0064783,           0.0064783
                   2,            0.029565,           0.0097657,                   0,             0.48314,             