# CMFD Evaluation Notebook

This notebook evaluates the copy-move forgery detection (CMFD) method of [1], with some modifications provided by Prof. Verdoliva et al. to adapt it to scientific figures, using two approaches:
1. Zernike Features with PatchMatch [1]
2. Fusion of Zernike and SIFT Features with PatchMatch


We run the approaches above on 3 different datasets:
1. Figures cited in a retraction notice with suspected manipulation within a single figure
2. Randomly selected Creative Commons figures from PubMed Central® (PMC) -- without any retraction associated with them (as of the end of 2020)
3. Pairs of figures cited in a retraction notice with suspected manipulation across the pair.
    
### Reference
[1] D.Cozzolino, G.Poggi, L.Verdoliva, ''Efficient dense-field copy-move forgery detection'', IEEE TIFS 2015.

# Requirements
cairosvg  --> pip install cairosvg

## Experiment Setup

In [9]:
import json, os, shutil
from cairosvg import svg2png

In [2]:
# Stage every retracted figure suspected of within-figure manipulation
# in its own case folder under cmfd_exp/retracted_single_figures/
with open('dataset_tasks/cmfd/cmfd-hit-or-miss-single.json','r') as setup_file:
    retracted_hit_or_miss_setup = json.load(setup_file)

for case_key, entry in retracted_hit_or_miss_setup.items():
    # Folder names carry only the case identifier, without the 'case-' prefix
    case_id = case_key.replace('case-','')
    os.makedirs(f'cmfd_exp/retracted_single_figures/{case_id}',exist_ok=True)

    # Copy the figure into the experiment folder, keeping its file name
    figure_src = entry['figure-path']
    shutil.copy(
        figure_src,
        f'cmfd_exp/retracted_single_figures/{case_id}/{os.path.basename(figure_src)}',
    )
     
    

In [3]:
# Stage every randomly selected PMC figure in its own case folder
# under cmfd_exp/pmc_random_selected/
with open('dataset_tasks/cmfd/cmfd-hit-or-miss-pristine.json','r') as setup_file:
    pmc_hit_or_miss_setup = json.load(setup_file)

for case_key, entry in pmc_hit_or_miss_setup.items():
    # Folder names carry only the case identifier, without the 'pristine-case-' prefix
    case_id = case_key.replace('pristine-case-','')
    os.makedirs(f'cmfd_exp/pmc_random_selected/{case_id}',exist_ok=True)

    # Copy the figure into the experiment folder, keeping its file name
    figure_src = entry['figure-path']
    shutil.copy(
        figure_src,
        f'cmfd_exp/pmc_random_selected/{case_id}/{os.path.basename(figure_src)}',
    )
     

In [4]:
# Stage the retracted figure pairs suspected of manipulation across the pair
# under cmfd_exp/retracted_pair_figures/
with open('dataset_tasks/cmfd/cmfd-hit-or-miss-pair.json','r') as setup_file:
    retracted_pair_setup = json.load(setup_file)

for case_key, entry in retracted_pair_setup.items():
    case_id = case_key.replace('case-','')
    os.makedirs(f'cmfd_exp/retracted_pair_figures/{case_id}',exist_ok=True)

    # Both figures of a case may share a file name, so they are stored as
    # fig1.<ext> and fig2.<ext> to keep one from overwriting the other.
    for path_key, stored_name in (('figure-1-path', 'fig1'), ('figure-2-path', 'fig2')):
        origin = entry[path_key]
        # Everything after the last dot is kept as the extension
        suffix = origin[origin.rfind(".")+1:]
        shutil.copy(
            origin,
            f'cmfd_exp/retracted_pair_figures/{case_id}/{stored_name}.{suffix}',
        )
     

**Import Cell**

In [11]:
from glob import glob
import tempfile
import os
import subprocess
from tqdm import tqdm
import shutil
import cv2
from PIL import Image
import numpy as np
import h5py
# Folder of the MATLAB forensic toolbox; the fusion helpers in this cell run
# its addpath_sift_forensic.m script before calling the detector.
LIB_PATH = "unina_forensic_matlab"
# Set the shared-library search path to the toolbox dependencies (absolute path).
# NOTE(review): this replaces any LD_LIBRARY_PATH already set for the kernel
# instead of extending it — confirm that is intended.
os.environ['LD_LIBRARY_PATH'] = os.path.realpath("unina_forensic_matlab/dependencies")

def convertsvg2png(svgfile):
    """
    Render an SVG file to a PNG written next to it.

    The PNG path is the SVG path with its extension replaced by `.png`.

    svgfile: path of the SVG file to convert.
    Returns: path of the PNG file that was written.
    """
    # splitext only inspects the last path component, so a dot inside a
    # directory name (or a missing extension) cannot truncate the output path.
    stem = os.path.splitext(svgfile)[0]
    pngfile = f"{stem}.png"
    svg2png(url=svgfile, write_to=pngfile)
    return pngfile

def zernikePM_on_single_fig(img, tmpdir):
    """
    Run the `pm-sci-int` docker image on a single figure.

    `tmpdir` is mounted in the container as `/data`; the container is given
    `data/<img>` as input and `data/out.png` as output path.

    img: file name of the figure inside `tmpdir`.
    tmpdir: host directory to mount into the container.
    Returns: the `subprocess.CompletedProcess` of the docker call.
    """
    # An argument list (no shell) keeps spaces, parentheses or `$` in the
    # figure name or directory from being split or interpreted by a shell.
    cmd = [
        "docker", "run",
        "-v", f"{tmpdir}:/data",
        "pm-sci-int",
        f"data/{img}",
        "data/out.png",
    ]
    return subprocess.run(cmd)


def zernikePM_on_pair_figs(fig1, fig2, tmpdir):
    """
    Run the `pm-sci-int` docker image on a pair of figures.

    `tmpdir` is mounted in the container as `/data`; the container is given
    `data/<fig1>` and `data/<fig2>` as inputs and `data/out.png` as output path.

    fig1, fig2: file names of the two figures inside `tmpdir`.
    tmpdir: host directory to mount into the container.
    Returns: the `subprocess.CompletedProcess` of the docker call.
    """
    # An argument list (no shell) keeps spaces, parentheses or `$` in the
    # figure names or directory from being split or interpreted by a shell.
    cmd = [
        "docker", "run",
        "-v", f"{tmpdir}:/data",
        "pm-sci-int",
        f"data/{fig1}",
        f"data/{fig2}",
        "data/out.png",
    ]
    return subprocess.run(cmd)

def cmfdZernikePM_on_single(dataset):
    """
    Execute the Zernike + PatchMatch CMFD method on single figure cases.

    Each figure is copied to a scratch directory, processed there by
    `zernikePM_on_single_fig`, and the two files the run leaves behind
    (`out.png` and `out_col.png`) are copied to a `zernikePM` folder next to
    the figure as `<name>_out.png` and `<name>_out_col.png`, where `<name>` is
    the figure file name without its final extension.

    dataset: iterable of figure paths; `.svg` figures are converted to PNG first.
    Raises: FileNotFoundError (from the copy) when the run did not produce
    its output files.
    """
    for fig in tqdm(dataset):
        # SVG inputs are rasterised first; the PNG is written beside the SVG
        if fig.endswith('.svg'):
            fig = convertsvg2png(fig)

        fig_name = os.path.basename(fig)
        # Strip only the final extension. Interior dots must survive so the
        # output name matches the one the evaluation cells look for.
        stem = os.path.splitext(fig_name)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            # Stage the figure where the container can see it
            shutil.copy(fig, f'{tmpdir}/{fig_name}')
            zernikePM_on_single_fig(fig_name, tmpdir)

            # join() keeps the output folder relative when the figure path has
            # no directory part (plain concatenation would yield "/zernikePM").
            outdir = os.path.join(os.path.dirname(fig), "zernikePM")
            os.makedirs(outdir, exist_ok=True)

            shutil.copy(f'{tmpdir}/out.png', f'{outdir}/{stem}_out.png')
            shutil.copy(f'{tmpdir}/out_col.png', f'{outdir}/{stem}_out_col.png')
            
def cmfdZernikePM_on_pair(dataset):
    """
    Execute the Zernike + PatchMatch CMFD method on a pair of figures for
    each case of the dataset.

    Both figures are copied to a scratch directory and processed there by
    `zernikePM_on_pair_figs`. The files `out_A.png` / `out_A_col.png` are then
    copied as `<name1>_out.png` / `<name1>_out_col.png`, and `out_B.png` /
    `out_B_col.png` as `<name2>_out.png` / `<name2>_out_col.png`, into a
    `zernikePM` folder next to the first figure. `<name>` is the figure file
    name without its final extension.

    dataset: iterable of (fig1, fig2) path pairs; `.svg` figures are
    converted to PNG first.
    Raises: FileNotFoundError (from the copy) when the run did not produce
    its output files.
    """
    for fig1, fig2 in tqdm(dataset):
        with tempfile.TemporaryDirectory() as tmpdir:
            staged = []
            for fig in (fig1, fig2):
                # SVG inputs are rasterised first; the PNG is written beside the SVG
                if fig.endswith('.svg'):
                    fig = convertsvg2png(fig)
                # Stage the figure where the container can see it
                shutil.copy(fig, f'{tmpdir}/{os.path.basename(fig)}')
                staged.append(fig)
            fig1, fig2 = staged
            name1 = os.path.basename(fig1)
            name2 = os.path.basename(fig2)

            zernikePM_on_pair_figs(name1, name2, tmpdir)

            # Results go next to the first figure. join() keeps the folder
            # relative when that path has no directory part.
            outdir = os.path.join(os.path.dirname(fig1), "zernikePM")
            os.makedirs(outdir, exist_ok=True)

            # Strip only the final extension of each file name (interior dots
            # are part of the name and must be kept).
            stem1 = os.path.splitext(name1)[0]
            stem2 = os.path.splitext(name2)[0]

            shutil.copy(f'{tmpdir}/out_A.png', f'{outdir}/{stem1}_out.png')
            shutil.copy(f'{tmpdir}/out_A_col.png', f'{outdir}/{stem1}_out_col.png')
            shutil.copy(f'{tmpdir}/out_B.png', f'{outdir}/{stem2}_out.png')
            shutil.copy(f'{tmpdir}/out_B_col.png', f'{outdir}/{stem2}_out_col.png')

##################################################
#####             USING Fusion           #########
##################################################
def create_result_map(img_map):
    """
    Collapse a stack of masks into a single labeled map.

    img_map: array indexed as [row, column, plane].
    Returns: 2-D array of zeros in which every position where plane `i`
    equals 1 holds the label `i + 1`; where several planes equal 1, the
    highest plane index wins.
    """
    labeled = np.zeros(img_map.shape[:2])
    for label, plane in enumerate(range(img_map.shape[2]), start=1):
        labeled[img_map[:, :, plane] == 1] = label
    return labeled
    

def execute_fusion_on_pair(filename_imgA, filename_imgB):
    """
    Run the MATLAB detector `main_keypoint_based` on a pair of images and
    return one labeled map per image.

    MATLAB is given the path of a temporary HDF5 file, from which the
    datasets `/mapA`, `/mapB` and `/num_gt` are read back. When `/num_gt` is
    positive, both maps are turned into labeled maps with
    `create_result_map`. Otherwise, including when the result file cannot be
    read, an all-zero map with the height and width of each input image is
    returned.

    The folders `tmp` and `siftFeats` in the current working directory are
    wiped before the run and removed afterwards (presumably scratch space
    for the MATLAB code).

    filename_imgA, filename_imgB: paths of the two images.
    Returns: tuple (resultA, resultB) of 2-D numpy arrays.
    """
    def matlab_str(text):
        # Inside a MATLAB single-quoted literal a quote is written as two quotes
        return str(text).replace("'", "''")

    # Start from empty scratch folders
    for workdir in ('tmp', 'siftFeats'):
        if os.path.isdir(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir, exist_ok=True)

    mapA = mapB = None
    num_gt = 0
    try:
        with tempfile.NamedTemporaryFile('w+b', suffix='.hdf5') as tmpfile:
            matlab_code = (
                "run('%s/addpath_sift_forensic.m');"
                "try main_keypoint_based('%s','%s','%s'); catch; end; quit"
                % (
                    matlab_str(LIB_PATH),
                    matlab_str(filename_imgA),
                    matlab_str(filename_imgB),
                    matlab_str(tmpfile.name),
                )
            )
            # Argument list instead of a shell string: file names cannot be
            # split or expanded by a shell.
            subprocess.run(["matlab", "-nodisplay", "-r", matlab_code])
            try:
                with h5py.File(tmpfile.name, 'r') as f:
                    mapA = f['/mapA'][()]
                    mapB = f['/mapB'][()]
                    num_gt = int(f['/num_gt'][()])
            except Exception:
                # Best effort: an unreadable or incomplete result file counts
                # as "nothing detected". KeyboardInterrupt still propagates.
                num_gt = 0
    finally:
        # Remove the scratch folders even if the run was interrupted
        for workdir in ('tmp', 'siftFeats'):
            shutil.rmtree(workdir, ignore_errors=True)

    if num_gt > 0:
        if mapA.ndim == 2:
            # A 2-D result is treated as a stack holding a single map
            mapA = mapA[None, :, :]
            mapB = mapB[None, :, :]
        # Reverse the axis order so the stack index comes last, as
        # create_result_map expects (presumably undoing MATLAB's column-major
        # layout — TODO confirm).
        mapA = np.transpose(mapA, (2, 1, 0)).copy()
        mapB = np.transpose(mapB, (2, 1, 0)).copy()
        return create_result_map(mapA), create_result_map(mapB)

    # Nothing detected: return empty maps with the size of the inputs
    with Image.open(filename_imgA) as imgA:
        shapeA = (imgA.height, imgA.width)
    with Image.open(filename_imgB) as imgB:
        shapeB = (imgB.height, imgB.width)
    return np.zeros(shapeA), np.zeros(shapeB)


def execute_fusion_on_single(filename_imgA):
    """
    Run the MATLAB detector `main_cmfd_keypoint_based` on one image and
    return its labeled map.

    MATLAB is given the path of a temporary HDF5 file, from which the
    datasets `/mapA` and `/num_gt` are read back. When `/num_gt` is positive,
    the map is turned into a labeled map with `create_result_map`. Otherwise,
    including when the result file cannot be read, an all-zero map with the
    height and width of the input image is returned.

    The folders `tmp` and `siftFeats` in the current working directory are
    wiped before the run and removed afterwards; MATLAB uses them to save the
    SIFT features and related files.

    filename_imgA: path of the image.
    Returns: 2-D numpy array.
    """
    def matlab_str(text):
        # Inside a MATLAB single-quoted literal a quote is written as two quotes
        return str(text).replace("'", "''")

    # Start from empty auxiliary folders
    for workdir in ('tmp', 'siftFeats'):
        if os.path.isdir(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir, exist_ok=True)

    mapA = None
    num_gt = 0
    try:
        with tempfile.NamedTemporaryFile('w+b', suffix='.hdf5') as tmpfile:
            matlab_code = (
                "run('%s/addpath_sift_forensic.m');"
                "try main_cmfd_keypoint_based('%s','%s'); catch; end; quit"
                % (
                    matlab_str(LIB_PATH),
                    matlab_str(filename_imgA),
                    matlab_str(tmpfile.name),
                )
            )
            # Argument list instead of a shell string: file names cannot be
            # split or expanded by a shell.
            subprocess.run(["matlab", "-nodisplay", "-r", matlab_code])
            try:
                with h5py.File(tmpfile.name, 'r') as f:
                    mapA = f['/mapA'][()]
                    num_gt = int(f['/num_gt'][()])
            except Exception:
                # Best effort: an unreadable or incomplete result file counts
                # as "nothing detected". KeyboardInterrupt still propagates.
                num_gt = 0
    finally:
        # Remove the auxiliary folders even if the run was interrupted
        for workdir in ('tmp', 'siftFeats'):
            shutil.rmtree(workdir, ignore_errors=True)

    if num_gt > 0:
        if mapA.ndim == 2:
            # A 2-D result is treated as a stack holding a single map
            mapA = mapA[None, :, :]
        # Reverse the axis order so the stack index comes last, as
        # create_result_map expects (presumably undoing MATLAB's column-major
        # layout — TODO confirm).
        mapA = np.transpose(mapA, (2, 1, 0)).copy()
        return create_result_map(mapA)

    # Nothing detected: return an empty map with the size of the input
    with Image.open(filename_imgA) as imgA:
        shapeA = (imgA.height, imgA.width)
    return np.zeros(shapeA)

def cmfdFusionPM_on_single(dataset):
    """
    Execute the fusion CMFD method on single figure cases.

    For each figure, the map returned by `execute_fusion_on_single` is saved
    in a `fusionPM` folder next to the figure, in two forms:
    - `<name>_out_labeled.png`: background is zero and every detected region
      carries its ID (IDs start at 1);
    - `<name>_out.png`: detected pixels are 0 and all other pixels are 255.
    `<name>` is the figure file name without its final extension.

    dataset: iterable of figure paths; `.svg` figures are converted to PNG first.
    """
    for fig in tqdm(dataset):
        # SVG inputs are rasterised first; the PNG is written beside the SVG
        if fig.endswith('.svg'):
            fig = convertsvg2png(fig)

        figMap = execute_fusion_on_single(fig)

        # join() keeps the output folder relative when the figure path has no
        # directory part (plain concatenation would yield "/fusionPM/").
        outdir = os.path.join(os.path.dirname(fig), "fusionPM")
        os.makedirs(outdir, exist_ok=True)

        # Strip only the final extension. Interior dots must survive so the
        # output name matches the one the evaluation cells look for.
        stem = os.path.splitext(os.path.basename(fig))[0]

        # Labeled map
        Image.fromarray(figMap).convert("L").save(f'{outdir}/{stem}_out_labeled.png')

        # Unlabeled map: black where something was detected, white elsewhere
        fig_output = np.ones_like(figMap)*255
        fig_output[figMap>0] = 0
        Image.fromarray(fig_output).convert("L").save(f'{outdir}/{stem}_out.png')
    
def cmfdFusionPM_on_pair(dataset):
    """
    Execute the fusion CMFD method on every pair of figures of the dataset.

    The two maps returned by `execute_fusion_on_pair` are saved in a
    `fusionPM` folder next to the first figure:
    - `fig1_out_labeled.png` / `fig2_out_labeled.png`: background is zero and
      every detected region carries its ID (IDs start at 1);
    - `fig1_out.png` / `fig2_out.png`: detected pixels are 0 and all other
      pixels are 255.
    """
    def unlabeled_image(label_map):
        # Black where something was detected, white elsewhere
        plain = np.ones_like(label_map)*255
        plain[label_map>0] = 0
        return Image.fromarray(plain).convert("L")

    for first, second in tqdm(dataset):
        # SVG inputs are rasterised before the detector runs
        first = convertsvg2png(first) if first.endswith('.svg') else first
        second = convertsvg2png(second) if second.endswith('.svg') else second

        first_map, second_map = execute_fusion_on_pair(first, second)

        first_labeled = Image.fromarray(first_map).convert("L")
        second_labeled = Image.fromarray(second_map).convert("L")

        # Results are stored next to the first figure of the pair
        target_dir = f"{os.path.dirname(first)}/fusionPM/"
        os.makedirs(target_dir, exist_ok=True)

        # Labeled maps
        first_labeled.save(f"{target_dir}fig1_out_labeled.png")
        second_labeled.save(f"{target_dir}fig2_out_labeled.png")

        # Unlabeled maps
        first_plain = unlabeled_image(first_map)
        second_plain = unlabeled_image(second_map)
        first_plain.save(f"{target_dir}fig1_out.png")
        second_plain.save(f"{target_dir}fig2_out.png")

# Running methods


### Single Figure

In [14]:
# Paths of the retracted single figures staged under cmfd_exp/, in sorted order
retracted_single_figures = sorted(
    f"cmfd_exp/retracted_single_figures/{case_key.replace('case-','')}/{os.path.basename(entry['figure-path'])}"
    for case_key, entry in retracted_hit_or_miss_setup.items()
)

# Run both detectors on the whole collection
cmfdZernikePM_on_single(retracted_single_figures)
cmfdFusionPM_on_single(retracted_single_figures)

100%|██████████| 155/155 [1:19:46<00:00, 30.88s/it] 
100%|██████████| 155/155 [54:06<00:00, 20.95s/it] 


## Random PMC Papers ('pristines')

In [15]:
# Paths of the randomly selected PMC figures staged under cmfd_exp/, in sorted order
random_selected_pmc_figures = sorted(
    f"cmfd_exp/pmc_random_selected/{case_key.replace('pristine-case-','')}/{os.path.basename(entry['figure-path'])}"
    for case_key, entry in pmc_hit_or_miss_setup.items()
)

# Run both detectors on the whole collection
cmfdZernikePM_on_single(random_selected_pmc_figures)
cmfdFusionPM_on_single(random_selected_pmc_figures)

100%|██████████| 332/332 [2:57:03<00:00, 32.00s/it]    
100%|██████████| 332/332 [2:08:23<00:00, 23.20s/it]   


### Run method on pair of issued Figures

In [None]:
# Rebuild, for every case, the (fig1, fig2) paths under which the pair was
# staged in cmfd_exp/retracted_pair_figures/
retracted_pair_figures = []
for case_key, entry in retracted_pair_setup.items():
    case_id = case_key.replace('case-','')

    staged_pair = []
    for path_key, stored_name in (('figure-1-path', 'fig1'), ('figure-2-path', 'fig2')):
        origin = entry[path_key]
        # Same extension rule as the staging cell: text after the last dot
        suffix = origin[origin.rfind(".")+1:]
        staged_pair.append(f'cmfd_exp/retracted_pair_figures/{case_id}/{stored_name}.{suffix}')

    retracted_pair_figures.append(tuple(staged_pair))

# Run both detectors on every pair
cmfdZernikePM_on_pair(retracted_pair_figures)
cmfdFusionPM_on_pair(retracted_pair_figures)

 19%|█▊        | 17/91 [18:08<1:16:56, 62.38s/it]

# Evaluation - HIT or MISS
To evaluate the methods we use a hit-or-miss approach.

Below we describe how a hit is defined for each scenario of the dataset:
**HIT:**
- Retracted single figure:

    We consider a hit if there is any activation in the output map (in the code below, more than 10 activated pixels)


- Retracted pair of figures:

    We consider a hit only if the output maps of both input images were activated


- PMC randomly selected figures ( with no retraction associated)

    We consider a hit if no region of the output map was activated

**Evaluation functions**

In [18]:
def evaluate_single_figure(dataset_output, pristine=False, min_pixels=10):
    """
    Hit-or-miss evaluation of single-figure output maps.

    A map counts as activated when more than `min_pixels` of its pixels are
    exactly 0 in grayscale; every non-zero gray level is treated as background.
    - pristine=False: an activated map is a hit.
    - pristine=True: a map that is NOT activated is a hit.

    Prints "<hits> / <total>" followed by the hit ratio (nan when the dataset
    is empty). Nothing is returned.

    dataset_output: iterable of output map paths.
    pristine: whether the figures are expected to be free of manipulation.
    min_pixels: number of zero pixels a map must exceed to count as activated.
    Raises: FileNotFoundError when a map cannot be read.
    """
    score = []
    for result in dataset_output:
        img = cv2.imread(result, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals a missing or unreadable file by returning None
            raise FileNotFoundError(f"could not read output map: {result}")
        activated = np.count_nonzero(img == 0) > min_pixels
        # Hit when the activation disagrees with the "pristine" expectation
        score.append(int(activated != bool(pristine)))

    hits, total = sum(score), len(score)
    print(f"{hits} / {total}")
    print(hits / total if total else float('nan'))
    
    
def evaluate_pair_figures(dataset_output_pairs, min_pixels=10):
    """
    Hit-or-miss evaluation of the output maps of figure pairs.

    A map counts as activated when more than `min_pixels` of its pixels are
    exactly 0 in grayscale; every non-zero gray level is treated as background.
    A pair is a hit only when both of its maps are activated.

    Prints "<hits> / <total>" followed by the hit ratio (nan when the dataset
    is empty). Nothing is returned.

    dataset_output_pairs: iterable of (fig1_map_path, fig2_map_path).
    min_pixels: number of zero pixels a map must exceed to count as activated.
    Raises: FileNotFoundError when a map cannot be read.
    """
    score = []
    for fig1, fig2 in dataset_output_pairs:
        activated = []
        for path in (fig1, fig2):
            img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals a missing or unreadable file by returning None
                raise FileNotFoundError(f"could not read output map: {path}")
            activated.append(np.count_nonzero(img == 0) > min_pixels)
        score.append(int(all(activated)))

    hits, total = sum(score), len(score)
    print(f"{hits} / {total}")
    print(hits / total if total else float('nan'))

### Single Figure from Retracted papers

In [19]:
# Gather, for every retracted single-figure case, the output map of each method
retracted_single_figures_zernike = []
retracted_single_figures_fusion = []
for case_key, entry in retracted_hit_or_miss_setup.items():
    case_id = case_key.replace('case-','')
    figure_name = os.path.basename(entry['figure-path'])
    # Output maps are named after the figure without its extension
    figure_stem = figure_name[:figure_name.rfind(".")]

    for method_dir, collected in (
        ("zernikePM", retracted_single_figures_zernike),
        ("fusionPM", retracted_single_figures_fusion),
    ):
        output_map = f"cmfd_exp/retracted_single_figures/{case_id}/{method_dir}/{figure_stem}_out.png"
        # Stop early if a method has not been run for this case
        if not os.path.isfile(output_map):
            raise FileNotFoundError(output_map)
        collected.append(output_map)

retracted_single_figures_zernike.sort()
retracted_single_figures_fusion.sort()

# PatchMatch with Zernike features
print("Zernike PM: ")
evaluate_single_figure(retracted_single_figures_zernike, pristine=False)

# PatchMatch with the fusion of SIFT and Zernike features
print("Fusion PM: ")
evaluate_single_figure(retracted_single_figures_fusion, pristine=False)


Zernike PM: 
114 / 155
0.7354838709677419
Fusion PM: 
139 / 155
0.896774193548387


### Single Figure from Random PMC selected papers (Pristine)

In [20]:
# Gather, for every randomly selected PMC case, the output map of each method
random_pmc_single_figures_zernike = []
random_pmc_single_figures_fusion = []
for case_key, entry in pmc_hit_or_miss_setup.items():
    case_id = case_key.replace('pristine-case-','')
    figure_name = os.path.basename(entry['figure-path'])
    # Output maps are named after the figure without its extension
    figure_stem = figure_name[:figure_name.rfind(".")]

    for method_dir, collected in (
        ("zernikePM", random_pmc_single_figures_zernike),
        ("fusionPM", random_pmc_single_figures_fusion),
    ):
        output_map = f"cmfd_exp/pmc_random_selected/{case_id}/{method_dir}/{figure_stem}_out.png"
        # Stop early if a method has not been run for this case
        if not os.path.isfile(output_map):
            raise FileNotFoundError(output_map)
        collected.append(output_map)

random_pmc_single_figures_zernike.sort()
random_pmc_single_figures_fusion.sort()

# PatchMatch with Zernike features
print("Zernike PM: ")
evaluate_single_figure(random_pmc_single_figures_zernike, pristine=True)

# PatchMatch with the fusion of SIFT and Zernike features
print("Fusion PM: ")
evaluate_single_figure(random_pmc_single_figures_fusion, pristine=True)

Zernike PM: 
155 / 332
0.46686746987951805
Fusion PM: 
108 / 332
0.3253012048192771


### Retracted Pair of figures

In [21]:
# Gather, for every retracted pair case, the two output maps of each method
retracted_pair_figures_zernike = []
retracted_pair_figures_fusion = []
for case_key in retracted_pair_setup:
    case_id = case_key.replace('case-','')

    for method_dir, collected in (
        ("zernikePM", retracted_pair_figures_zernike),
        ("fusionPM", retracted_pair_figures_fusion),
    ):
        pair_outputs = (
            f'cmfd_exp/retracted_pair_figures/{case_id}/{method_dir}/fig1_out.png',
            f'cmfd_exp/retracted_pair_figures/{case_id}/{method_dir}/fig2_out.png',
        )
        # Same guard as the single-figure evaluation cells: stop early when a
        # method has not produced its maps, rather than failing inside the
        # evaluation. The case folder is deliberately NOT created here, since
        # evaluation should only read results.
        for output_map in pair_outputs:
            if not os.path.isfile(output_map):
                raise FileNotFoundError(output_map)
        collected.append(pair_outputs)

retracted_pair_figures_zernike.sort()
retracted_pair_figures_fusion.sort()

# PatchMatch with Zernike features
print("Zernike PM: ")
evaluate_pair_figures(retracted_pair_figures_zernike)

# PatchMatch with the fusion of SIFT and Zernike features
print("Fusion PM: ")
evaluate_pair_figures(retracted_pair_figures_fusion)


Zernike PM: 
14 / 91
0.15384615384615385
Fusion PM: 
29 / 91
0.31868131868131866
