In [35]:
import polars as pl
import os
import cv2
from figure import detect_subfigure_labels, dummy_detect_label_letter, classify_subfigures, save_subfigures, show_image_with_bboxes

In [30]:
import re

def count_subfigures(caption):
    patterns = [
        r"(?:^|\.\s)([a-z](?=,))", # a, 
        r"(?:^|\.\s|,\s)([a-z])\.", # a.
        r"(?:^|\.\s|,\s)\(([a-z])\)", # (a)
        r"\(([a-z])-([a-z])\)", # (a-b)
        r"\(([a-z])\sand\s([a-z])\)", # (a and b)
        r"\,([a-z])\,", # ,a,
        r"(?:^|\.\s)[a-z]–([a-z])" # a–b -- captures the second one
    ]
    all_patterns = r"|".join(patterns)
    matches = re.finditer(all_patterns, caption, re.IGNORECASE)
    letters = []
    for match in matches:
        groups = [group.lower() for group in match.groups() if group is not None]
        letters.append(max(groups))

    if len(letters) == 0:
        return 1
    return ord(max(letters)) - ord('a') + 1

sample_caption = "a, UMAP shows CTB and STB nuclei profiled with integration of snRNA-seq and snATAC-seq in early pregnancy. b, Heatmap shows pseudotime ordering of the 23,746 DARs in early pregnancy (left). Zoomed-in genomic tracks show the cis-element accessibility of representative genes. The normalized accessibility of DARs is presented with color intensities. c, TF-mining heatmap shows candidate master TFs of each nucleus type. The NES and expression z score are presented with dot color and dot size, respectively. Dots with bold edges show selected TFs used for network construction. d, Regulatory networks with three layers (TF, cis-element and target gene) represent the cis-regulatory architecture covering the complete STB differentiation process. The gene percentage, peak accessibility and gene expression score are presented with circle sizes, edge width and color intensities, respectively. e, smFISH staining (left) and fluorescence intensities (right) show the colocalization of TFs (STAT5A and FOSL2) and representative genes (PAPPA and FLT1) of eSTB mature 1 and eSTB mature 2. hCG, STB marker. The white dotted lines show the outline of CTB. The white lines represent the plotted tracks of fluorescence intensity. f, UMAP shows CTB and STB nuclei profiled with integration of snRNA-seq of STB-CT30, STB-BL and placenta villi in early pregnancy. g, Heatmaps show marker gene expression in STB-BL, STB-CT30 and placental villi in early pregnancy. The expression levels are presented with color intensities. Binarized gene expression levels (positive or negative) are calculated and visualized beside heatmaps for the two marker genes, PAPPA and FLT1. h, Pseudotime ordering shows three differential trajectories of CTB and STB nuclei (upper left). The differentiation time is presented with color intensities. Quantitative classifications for each cluster (upper right) and specific pseudotime of cluster 9 and cluster 7 on the integrated UMAP (bottom) are shown. i, Heatmap shows the different expression patterns of marker genes, master TFs and representative hormones among in vitro trophoblast models and in vivo placental villi in early pregnancy. The z scores are presented with color intensities. Examples of genes with similar expression patterns are highlighted in bold black, and those with different patterns are colored in red."
print(count_subfigures(sample_caption))

9


In [3]:
image_folder = "imgs"
image_files = [os.path.join(image_folder, file) for file in os.listdir(image_folder) if file.endswith((".png", ".jpg", ".gif"))]

image_files = ["imgs/10335933_Fig5.jpg"]
for image_file in image_files:
    bounding_boxes = detect_subfigure_labels(image_file)
    # show_image_with_bboxes(image_file, bounding_boxes)
    subfigure_info, concate_img = dummy_detect_label_letter(image_file, bounding_boxes)
    # show_labels_on_image(image_file, subfigure_info, save=False, show=True)
    figure_json = classify_subfigures(image_file, subfigure_info, concate_img)
    save_subfigures(image_file, figure_json, out_dir="test")


image 1/1 /Users/etowah/projects/forks/exsclaim/imgs/10335933_Fig5.jpg: 640x544 13 labels, 146.3ms
Speed: 5.6ms preprocess, 146.3ms inference, 7.6ms postprocess per image at shape (1, 3, 640, 544)


  x_shift = dtype(np.broadcast_to(


In [42]:
def non_max_suppression(boxes, scores, threshold):
    """
    Perform non-max suppression on a set of bounding boxes and corresponding scores.

    :param boxes: a list of bounding boxes in the format [xmin, ymin, xmax, ymax]
    :param scores: a list of corresponding scores
    :param threshold: the IoU (intersection-over-union) threshold for merging bounding boxes
    :return: a list of indices of the boxes to keep after non-max suppression
    """
    # Sort the boxes by score in descending order
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    keep = []
    while order:
        i = order.pop(0)
        keep.append(i)
        for j in order:
            # Calculate the IoU between the two boxes
            intersection = max(0, min(boxes[i][2], boxes[j][2]) - max(boxes[i][0], boxes[j][0])) * \
                           max(0, min(boxes[i][3], boxes[j][3]) - max(boxes[i][1], boxes[j][1]))
            union = (boxes[i][2] - boxes[i][0]) * (boxes[i][3] - boxes[i][1]) + \
                    (boxes[j][2] - boxes[j][0]) * (boxes[j][3] - boxes[j][1]) - intersection
            iou = intersection / union

            # Remove boxes with IoU greater than the threshold
            if iou > threshold:
                order.remove(j)
    return keep

In [46]:
figure_data = pl.read_parquet("figure_data.parquet")
out_dir = "test"
failed = []

for row in figure_data.rows():
    base_dir = "/Users/etowah/projects/PMC-figure-downloader/img"
    # Access the values of each column in the current row
    figure_name = f"{row[0]}_{row[1]}"
    img_path = f"{base_dir}/{figure_name}.jpg"
    caption = row[4]
    n_subfigures = count_subfigures(caption)
    print(n_subfigures)
    if n_subfigures == 1:
        print(caption)
        image = cv2.imread(img_path)
        cv2.imwrite(f"{out_dir}/{figure_name}.jpg", image)
    else:
        bboxes = detect_subfigure_labels(img_path)
        scores = [box[4] for box in bboxes]
        keep = non_max_suppression(bboxes, scores, 0.5)
        bounding_boxes = [bboxes[i] for i in keep]
        # show_image_with_bboxes(image_file, bounding_boxes)
        if (len(bounding_boxes) != n_subfigures):
            print(f"Subfigure count mismatch: {len(bounding_boxes)} detected, {n_subfigures} expected")
            # show_image_with_bboxes(img_path, bounding_boxes)
            failed.append([row[0], row[1]])
            continue
        subfigure_info, concate_img = dummy_detect_label_letter(img_path, bounding_boxes)
        # show_labels_on_image(image_file, subfigure_info, save=False, show=True)
        figure_json = classify_subfigures(img_path, subfigure_info, concate_img)
        save_subfigures(img_path, figure_json, out_dir=out_dir)

    
print(failed)


4

image 1/1 /Users/etowah/projects/PMC-figure-downloader/img/10937393_Fig1.jpg: 640x544 5 labels, 77.1ms
Speed: 2.2ms preprocess, 77.1ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 544)
5

image 1/1 /Users/etowah/projects/PMC-figure-downloader/img/10937393_Fig2.jpg: 640x608 5 labels, 81.2ms
Speed: 2.3ms preprocess, 81.2ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 608)
6

image 1/1 /Users/etowah/projects/PMC-figure-downloader/img/10937393_Fig3.jpg: 576x640 7 labels, 82.5ms
Speed: 2.3ms preprocess, 82.5ms inference, 0.7ms postprocess per image at shape (1, 3, 576, 640)
Subfigure count mismatch: 7 detected, 6 expected
3

image 1/1 /Users/etowah/projects/PMC-figure-downloader/img/10937393_Fig4.jpg: 640x640 4 labels, 104.3ms
Speed: 2.3ms preprocess, 104.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
4

image 1/1 /Users/etowah/projects/PMC-figure-downloader/img/10937393_Fig5.jpg: 640x544 5 labels, 78.5ms
Speed: 2.0ms preprocess, 78.5ms