In [1]:
from datetime import datetime
import layoutparser as lp
import cv2
import os
import json
import random
from matplotlib import pyplot as plt

In [2]:
base_path = "../phishing_websites/legit_database_files"
model_prima = lp.Detectron2LayoutModel('lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config',
                                       extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.4],
                                       label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})

  from .autonotebook import tqdm as notebook_tqdm
The checkpoint state_dict contains keys that are not used by the model:
  [35mpixel_mean[0m
  [35mpixel_std[0m
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m


In [3]:
def remove_is_in(layout, center=True):
    lay = lp.Layout()

    for element in layout:
        inside = False
        for e in lay:
            if e is element:
                inside = True
                break
            if element.block.x_1 >= e.block.x_1 and element.block.y_1 >= e.block.y_1 and element.block.x_2 <= e.block.x_2 and element.block.y_2 <= e.block.y_2:
                inside = True
                break
        if not inside:
            lay += [element]
    return lay


def remove_big_small(layout, img_size):
    lay = lp.Layout()
    for element in layout:
        area = (element.area / img_size)
        if 0.5 > area > 0.01:
            lay += [element]
    return lay

def is_in_line(layout, im_height):
    lay = lp.Layout()

    for e in layout:
        y_center = e.block.center[1]
        found = False
        for elem in lay:
            if abs(y_center - elem.block.center[1]) < (im_height*0.05):
                found = True
                if e.block.x_1 < elem.block.x_1:
                    elem.block.x_1 = e.block.x_1
                if e.block.x_2 > elem.block.x_2:
                    elem.block.x_2 = e.block.x_2
                if e.block.y_1 < elem.block.y_1:
                    elem.block.y_1 = e.block.y_1
                if e.block.y_2 > elem.block.y_2:
                    elem.block.y_2 = e.block.y_2
        if not found:
            lay.insert(0, e)
    return lay.sort(key=lambda x: x.area, reverse=True)


def is_in_row(layout, im_width):
    lay = lp.Layout()

    for e in layout:
        x_center = e.block.center[0]
        found = False
        for elem in lay:
            if abs(x_center - elem.block.center[0]) < (im_width*0.05):
                found = True
                if e.block.y_1 < elem.block.y_1:
                    elem.block.y_1 = e.block.y_1
                if e.block.y_2 > elem.block.y_2:
                    elem.block.y_2 = e.block.y_2
                if e.block.x_1 < elem.block.x_1:
                    elem.block.x_1 = e.block.x_1
                if e.block.x_2 > elem.block.x_2:
                    elem.block.x_2 = e.block.x_2
        if not found:
            lay.insert(0, e)
    return lay.sort(key=lambda x: x.area, reverse=True)

## Start

In [13]:
base_path = "../PHISHPEDIA/Phishpedia/phishpedia/src/siamese_pedia/expand_targetlist_v2"
image = cv2.imread(f"{base_path}/Blockchain/loginpage.png")
image = image[..., ::-1]
image_size = image.shape[0] * image.shape[1]

lay = model_prima.detect(image)

In [14]:
lines = is_in_line(lay.sort(key=lambda x: x.block.center[1]), image.shape[0])
rows = is_in_row(lay.sort(key=lambda x: x.block.center[0]), image.shape[1])

l = remove_big_small(lines, image_size)
r = remove_big_small(rows, image_size)
#lr = (l + r).sort(key=lambda x: x.area, reverse=True)
lr = remove_is_in((l + r).sort(key=lambda x: x.area, reverse=True))

In [15]:
#lr = lr.to_dataframe().loc[:, ["x_1", "x_2", "y_1", "y_2"]].to_numpy()[1]
#l = lp.Layout()
#l.insert(0, lr[1])
#lr = l
#lr

In [16]:
#lines.to_dataframe().loc[:, ["x_1", "x_2", "y_1", "y_2"]].to_numpy()

In [17]:
image_lay = lp.draw_box(image, lay, box_width=3)
image_1 = lp.draw_box(image, l, box_width=3)
image_2 = lp.draw_box(image, r, box_width=3)
image_end = lp.draw_box(image, lr.pad(8,8,8,8), box_width=3)

In [18]:
image_lay.save(f"a.png")
image_1.save(f"a1.png")
image_2.save(f"a2.png")
image_end.save(f"b.png")

In [10]:
lay.to_dataframe().loc[:, ["x_1", "x_2", "y_1", "y_2"]].to_numpy()
for t in lay:
    print(t.block.center[1])

387.6941909790039
447.65208435058594
343.41664123535156
33.39764881134033
303.68772888183594
198.3244171142578
171.96878051757812
173.1952133178711
447.65208435058594
34.00912952423096
181.3691864013672
550.1845397949219
184.6432876586914
224.0603380203247
408.82460021972656
34.00912952423096
351.28662109375


In [11]:
for t in lines:
    print(t.block.center[1])

550.1845397949219
34.00912952423096
184.6432876586914
408.82460021972656
303.68772888183594
343.41664123535156
447.65208435058594
387.6941909790039


In [12]:
for t in lay:
    print(t)

TextBlock(block=Rectangle(x_1=263.12152099609375, y_1=164.3883819580078, x_2=447.77716064453125, y_2=611.0), text=None, id=None, type=Title, parent=None, next=None, score=0.9997488856315613)
TextBlock(block=Rectangle(x_1=472.1885986328125, y_1=443.88824462890625, x_2=644.8549194335938, y_2=451.4159240722656), text=None, id=None, type=Title, parent=None, next=None, score=0.9996209144592285)
TextBlock(block=Rectangle(x_1=287.9779968261719, y_1=335.2950744628906, x_2=650.5047607421875, y_2=351.5382080078125), text=None, id=None, type=Title, parent=None, next=None, score=0.9995309114456177)
TextBlock(block=Rectangle(x_1=718.0853881835938, y_1=30.465646743774414, x_2=895.0311279296875, y_2=36.32965087890625), text=None, id=None, type=Title, parent=None, next=None, score=0.9990813732147217)
TextBlock(block=Rectangle(x_1=280.46502685546875, y_1=292.6479797363281, x_2=628.3392333984375, y_2=314.72747802734375), text=None, id=None, type=Title, parent=None, next=None, score=0.9951797723770142)
T