In [None]:
import pandas as pd

In [None]:
# Read test.csv (resolved against the current working directory) into a DataFrame.
test_df = pd.read_csv('test.csv')

In [None]:
# Preview the first rows (head() defaults to five) to check the columns and values.
test_df.head()

In [None]:
# Take the first row by position; the list indexer [[0]] keeps the result a
# one-row DataFrame instead of collapsing it to a Series.
sample_unit = test_df.iloc[[0]]

In [None]:
# Display the selected one-row frame.
sample_unit

In [None]:
import os

# Check that the path stored in 'local_pdf_path' for the selected row is an existing file.
# The row was picked by position (iloc), so read it back by position as well;
# a label lookup with .loc[0, ...] only works while the index label happens to be 0.
os.path.isfile(sample_unit['local_pdf_path'].iloc[0])

In [None]:
# Show the row through its Styler object
# (presumably to read the cell contents untruncated — TODO confirm).
sample_unit.style

In [None]:
import numpy as np
from PIL import Image
import cv2
import imagehash
from skimage.metrics import structural_similarity as ssim
from pathlib import Path

def compute_feature_score(img1_path: str, img2_path: str) -> tuple:
    """Score the visual similarity of two image files with three measures.

    The measures are:
      * a perceptual-hash score derived from the Hamming distance between
        16x16 pHashes of the two images,
      * the structural similarity (SSIM) of the images reduced to
        256x256 grayscale,
      * the fraction of SIFT keypoints (on 512x512 grayscale) that survive
        a 0.7 ratio test after FLANN k-nearest-neighbour matching.

    Args:
        img1_path: Path of the first image file.
        img2_path: Path of the second image file.

    Returns:
        A 4-tuple ``(combined, phash_score, ssim_score, feature_ratio)`` with
        ``combined = 0.3 * phash_score + 0.3 * ssim_score + 0.4 * feature_ratio``.
        ``phash_score`` and ``feature_ratio`` lie in [0, 1]; ``ssim_score`` is
        returned as computed by ``structural_similarity`` and is not clamped.
        ``feature_ratio`` is 0.0 when too few descriptors are found or when
        feature matching fails.

    Raises:
        Whatever ``PIL.Image.open`` raises when a path cannot be read as an
        image (e.g. ``FileNotFoundError``).
    """
    import warnings

    phash_size = 16  # the perceptual hash has phash_size * phash_size bits

    # convert() returns an independent, fully loaded copy, so the file handles
    # can be released right away instead of leaking until garbage collection.
    with Image.open(img1_path) as src1:
        pil1 = src1.convert('RGB')
    with Image.open(img2_path) as src2:
        pil2 = src2.convert('RGB')

    arr1 = np.array(pil1)
    arr2 = np.array(pil2)

    # --- perceptual hash ---------------------------------------------------
    phash1 = imagehash.phash(pil1, hash_size=phash_size)
    phash2 = imagehash.phash(pil2, hash_size=phash_size)
    # Subtracting two hashes yields the number of differing bits. Normalise by
    # the total number of bits (256 for hash_size=16) so the score spans [0, 1]
    # instead of saturating at 0 once a quarter of the bits differ.
    hamming_distance = phash1 - phash2
    phash_score = max(0.0, 1 - hamming_distance / phash_size ** 2)

    # --- structural similarity ----------------------------------------------
    size = (256, 256)
    ssim_arr1 = np.array(pil1.convert('L').resize(size))
    ssim_arr2 = np.array(pil2.convert('L').resize(size))
    # Only the mean SSIM is needed, so the per-pixel map is not requested.
    ssim_score = ssim(ssim_arr1, ssim_arr2)

    # --- SIFT feature matching ----------------------------------------------
    gray1 = cv2.cvtColor(arr1, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(arr2, cv2.COLOR_RGB2GRAY)
    gray1 = cv2.resize(gray1, (512, 512))
    gray2 = cv2.resize(gray2, (512, 512))

    feature_ratio = 0.0
    try:
        sift = cv2.SIFT_create(nfeatures=500)
        kp1, des1 = sift.detectAndCompute(gray1, None)
        kp2, des2 = sift.detectAndCompute(gray2, None)

        # knnMatch with k=2 needs at least two descriptors on each side.
        if des1 is not None and des2 is not None and len(des1) >= 2 and len(des2) >= 2:
            FLANN_INDEX_KDTREE = 1
            index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
            search_params = dict(checks=50)
            flann = cv2.FlannBasedMatcher(index_params, search_params)
            matches = flann.knnMatch(des1, des2, k=2)

            # Ratio test: keep a match only when the best neighbour is clearly
            # closer than the second best. Check the length of each result
            # before unpacking it, since fewer than k neighbours may be
            # returned for a query descriptor.
            good = [
                pair[0]
                for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.7 * pair[1].distance
            ]
            # One match is produced per descriptor of image 1, so with unequal
            # keypoint counts the raw ratio can exceed 1; cap it so the weighted
            # combination stays on a common scale.
            feature_ratio = min(1.0, len(good) / min(len(kp1), len(kp2)))
    except Exception as exc:
        # Feature matching is best-effort: keep feature_ratio at 0.0, but report
        # the failure and let KeyboardInterrupt / SystemExit propagate.
        warnings.warn(
            f"SIFT feature matching failed; using feature_ratio=0.0 ({exc!r})",
            RuntimeWarning,
        )

    combined = 0.3 * phash_score + 0.3 * ssim_score + 0.4 * feature_ratio
    return combined, phash_score, ssim_score, feature_ratio

# Figure directories of the poster output and the paper output
folder1 = Path('./poster_output/markdown/figures/')
folder2 = Path('./paper_output/markdown/figures/')

# Reference image on the paper side: figure_029
fig029 = folder2 / '37030_figure_029.png'

# Score every poster figure whose file name contains 'figure_005' against figure_029.
# ('page_007_figure_005' already contains 'figure_005', so a single substring test
# covers both cases.) Sorting makes the printed order reproducible across runs,
# because iterdir() yields entries in arbitrary order.
for f in sorted(folder1.iterdir()):
    if 'figure_005' in f.name:
        print(f"Found poster fig: {f.name}")
        score, ph, ss, fr = compute_feature_score(str(f), str(fig029))
        print("  vs figure_029:")
        print(f"    Combined: {score:.4f}")
        print(f"    pHash: {ph:.4f}")
        print(f"    SSIM: {ss:.4f}")
        print(f"    Features: {fr:.4f}")

In [None]:
# Take the row at position 3 (the fourth row) as a second one-row DataFrame.
sample_unit_2 = test_df.iloc[[3]]

In [None]:
# Show the second sample row through its Styler object
# (presumably to read the cell contents untruncated — TODO confirm).
sample_unit_2.style

In [None]:
import pandas as pd

# Read data/train.csv (relative to the current working directory) into a DataFrame.
# NOTE(review): test.csv is read from the working directory itself while this file
# lives under data/ — confirm both locations are intended.
df = pd.read_csv('data/train.csv')

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Count the rows whose 'local_pdf_path' is missing. Series.sum() adds the boolean
# mask in one vectorised step instead of iterating it with the builtin sum().
df['local_pdf_path'].isna().sum()

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape