In [None]:
! pip install transformers timm evaluate accelerate

In [1]:
import os
from pathlib import Path 
from PIL import Image
import cv2
import numpy as np 
import pandas as pd
import torch
from torch import nn
import evaluate
from huggingface_hub import hf_hub_download
from transformers import AutoImageProcessor, TableTransformerForObjectDetection


In [2]:
TRAIN_DIR = Path("kaggle/input/oxml-2023-x-ml-cases-table-detector/task2/data/train/")
TRAIN_IMAGES_DIR = TRAIN_DIR / "images/"
TRAIN_LABELS_DIR = TRAIN_DIR / "labels/"

In [3]:
def get_labels(label_file): 
    with open(Path(TRAIN_LABELS_DIR) / label_file) as file: 
        return file.readlines()
    

def draw_bounding_box(doc_prefix):
    """utils function that returns document image along with the bouding box"""
    image_file = doc_prefix+".jpg"
    img = cv2.imread(str(TRAIN_IMAGES_DIR / image_file))
    dh, dw, _ = img.shape
    
    coordinates = get_labels(doc_prefix+".txt")
    
    for bounding_box in coordinates:
        _, x, y, width, height = map(float, bounding_box.split(' '))

        left = int((x - width / 2) * dw)
        right = int((x + width / 2) * dw)
        top = int((y - height / 2) * dh)
        bottom = int((y + height / 2) * dh)

        cv2.rectangle(img, (left, top), (right, bottom), (0, 0, 255), 2)
        # Convert the image back to PIL format
        image_with_bbox = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        
    return image_with_bbox

# 1. Transformer Zero-Shot Baseline

In [4]:
from transformers import AutoModelForObjectDetection

In [5]:
from csv import DictWriter

In [6]:
# directories
DATA_DIR = "kaggle/input/oxml-2023-x-ml-cases-table-detector/task2/data"
TEST_DIR = "test"
TRAIN_DIR = "train"
IMG_DIR = "images"
LABEL_DIR = "labels"
SUB_FILE = "kaggle/input/oxml-2023-x-ml-cases-table-detector/task2/submission_sample.csv"

In [7]:
def get_filenames(filepath):
    files = []
    for (dirpath, dirnames, filenames) in os.walk(filepath):
        files.extend(filenames)
        break
    return files
filenames = get_filenames(os.path.join(DATA_DIR, TEST_DIR, IMG_DIR))

In [8]:
all_img_files = [os.path.join(DATA_DIR, TEST_DIR, IMG_DIR, f) for f in filenames]

In [9]:
modelname = "microsoft/table-transformer-detection"
image_processor = AutoImageProcessor.from_pretrained(modelname)
model = AutoModelForObjectDetection.from_pretrained(modelname)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [20]:
sub_results = []
for i, img_path in enumerate(all_img_files):
    img = Image.open(img_path).convert("RGB")
    print(f"{(i+1)/len(all_img_files):.2%}", end='\r')
    tensor_input = image_processor(images=img, return_tensors="pt")
    output_tensor = model(**tensor_input) # outputs (center_x, center_y, width, height)
    img_size = img.size # is (width, height) fmt
    target_size = torch.tensor([img_size[::-1]]) # must be (height, width) fmt
    # post_process output format :: (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format
    results = image_processor.post_process_object_detection(output_tensor,
                                                                threshold=0.9,
                                                                target_sizes=target_size)[0]
    sub_results.append((results, img_size))

50.70%

# Process Result

In [21]:
def to_sub_format(c):
    """
    changes: top_left_x, top_left_y, bottom_right_x, bottom_right_y
    to: center_x, center_y, width, height
    """
    return list(
        ((c[0]+(c[2]-c[0])/2)/c[4], 
        (c[1]+(c[3]-c[1])/2)/c[5],
        (c[2]-c[0])/c[4],
        (c[3]-c[1])/c[5])
    )

In [22]:
coords = []
for f, i in zip(filenames, sub_results):
    if len(i[0]["boxes"]) != 0:  
        coord = i[0]["boxes"].detach().numpy().flatten().tolist()+ list(i[-1])
        coord = to_sub_format(coord)
    else:
        coord = [0]*4
    
    coords.append(coord)

In [23]:
df_sub_coords = pd.DataFrame(coords, columns=['x', 'y', 'width', 'height'])
df_sub_coords["doc_id"] = filenames

In [24]:
df_submission = pd.read_csv(SUB_FILE)

In [25]:
df_final = df_sub_coords.copy()
df_final = df_submission[["doc_id"]].merge(df_final, on="doc_id", how="left")
df_final = df_final.fillna(0)

In [32]:
df_final.head()

Unnamed: 0,x,y,width,height,doc_id
0,0.501449,0.489871,0.766514,0.704771,doc_45_10.jpg
1,0.503713,0.385183,0.533592,0.419413,doc_45_7.jpg
2,5.166678,0.671626,8.311664,1.004202,doc_43_42.jpg
3,0.0,0.0,0.0,0.0,doc_36_27.jpg
4,0.526616,0.552478,0.772723,0.211361,doc_42_68.jpg


# Save submission

In [None]:
df_final.to_csv(f"submission.csv", index=False)