In [None]:
import glob
from pathlib import Path
from matplotlib import pyplot as plt

# Exploration of "garbage" PDFs

In [None]:
# Report digests used as a sanity check for the heuristics below: a later cell
# prints, for each entry, whether it ended up in the flagged set `bad`.
# The trailing note on each line records the problem observed in that report.
bad_reports = [
    "c5e25f90c7006546", # ANSSI spacing
    "03bce905b71945aa", # ANSSI spacing
    "3477723044183b31", # ANSSI empty?
    "7e32023021d5aad2", # empty?
    "4c9468f20fdb04f7", # empty?
    "82c24f729c2e0092", # ANSSI spacing
    "e1daa354ae5a61fd", # ANSSI spacing
    "c80801f9a71b030e", # ANSSI spacing
]

# Report digests used as the counterpart sanity check: a later cell prints,
# for each entry, whether it stayed OUT of the flagged set `bad`.
# The trailing note on each line records why the report is considered acceptable.
good_reports = [
    "2544ffa2d8eef431", # Japan, short but OK
    "a0aa53cad9c5d049", # Korea, OK, but low avg
    "10f1399a27470345", # Korea, OK, but low avg
    "60c49ab7f7d33501", # Korea, OK, but low avg
    "e133881d7203a6e4", # Spain, OK
    "4ff70fb16691d53c", # India, OK
]

In [None]:
def average_line_length(text: str) -> float:
    """Return the mean number of characters per line of ``text``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted towards a line's length.

    Args:
        text: The full text to measure.

    Returns:
        Total characters across all lines divided by the number of lines,
        or ``0.0`` when ``text`` contains no lines.
    """
    rows = text.splitlines()
    if not rows:
        # Return a float here as well so the result type matches the annotation.
        return 0.0
    return sum(map(len, rows)) / len(rows)

def overall_size(text: str) -> int:
    """Return the total number of characters in ``text``.

    Unlike the per-line metrics, this counts every character, line
    terminators included.
    """
    return len(text)

def num_lines(text: str) -> int:
    """Return the number of lines in ``text`` as split by ``str.splitlines``.

    An empty string has zero lines, and a single trailing line terminator
    does not add an extra empty line.
    """
    return len(text.splitlines())

def every_second_char(text: str) -> int:
    """Count the lines whose odd-indexed characters are not all identical.

    For each line, the characters at indexes 1, 3, 5, ... are collected; the
    line is counted when they contain more than one distinct character.
    A line such as ``"T h i s"`` has only spaces at those positions and is
    therefore not counted, which is what makes a low result indicative of
    text with a spacer inserted after every character.

    Args:
        text: The full text to inspect.

    Returns:
        The number of lines with at least two distinct odd-indexed characters.
    """
    return sum(1 for line in text.splitlines() if len(set(line[1::2])) > 1)

def alpha_chars(text: str) -> float:
    """Return the fraction of characters in ``text`` that are alphabetic.

    A character counts when ``str.isalpha`` is true for it; digits,
    whitespace (including line terminators) and punctuation do not count
    but are still part of the denominator.

    Args:
        text: The full text to measure.

    Returns:
        A ratio between 0 and 1, or ``0.0`` for empty input.
    """
    total = len(text)
    if total == 0:
        # Avoid dividing by zero; keep the result a float like the non-empty case.
        return 0.0
    # Count matches directly instead of materialising a filtered copy of the text.
    return sum(1 for ch in text if ch.isalpha()) / total

In [None]:
# Lower bounds for each metric; a text falling below any one of them is
# classified as garbage by `garbage()`.
LINES_THRESHOLD = 30
SIZE_THRESHOLD = 1000
AVG_LLEN_THRESHOLD = 10
EVERY_SECOND_CHAR_THRESHOLD = 15
ALPHA_CHARS_THRESHOLD = 0.5


def garbage(text: str) -> bool:
    """Decide whether ``text`` looks like unusable ("garbage") content.

    Five metrics are computed and each is compared against its module-level
    threshold. The text is garbage as soon as any metric is strictly below
    its threshold:

    * number of lines (``LINES_THRESHOLD``),
    * total character count (``SIZE_THRESHOLD``),
    * average line length (``AVG_LLEN_THRESHOLD``),
    * number of lines whose odd-indexed characters are not all the same
      (``EVERY_SECOND_CHAR_THRESHOLD``) -- this detects the ANSSI spacing
      issues,
    * ratio of alphabetic characters to all characters
      (``ALPHA_CHARS_THRESHOLD``).

    Args:
        text: The full text of one report.

    Returns:
        ``True`` if any check fails, ``False`` otherwise.
    """
    rows = text.splitlines()
    n_rows = len(rows)
    total_chars = len(text)

    # Characters inside lines only (line terminators excluded).
    row_chars = sum(len(row) for row in rows)
    # Lines with at least two distinct characters at indexes 1, 3, 5, ...
    varied_rows = sum(1 for row in rows if len(set(row[1::2])) > 1)
    # Alphabetic characters (str.isalpha) across the whole text.
    letters = sum(1 for ch in text if ch.isalpha())

    # Both ratios fall back to 0 when their denominator is 0.
    mean_row_len = row_chars / n_rows if n_rows else 0
    letter_ratio = letters / total_chars if total_chars else 0

    return (
        n_rows < LINES_THRESHOLD
        or total_chars < SIZE_THRESHOLD
        or mean_row_len < AVG_LLEN_THRESHOLD
        or varied_rows < EVERY_SECOND_CHAR_THRESHOLD
        or letter_ratio < ALPHA_CHARS_THRESHOLD
    )


In [None]:
# Per-report metrics, each keyed by the report digest (the stem of the text
# file's name), plus the set of digests that trip at least one heuristic.
avgs = {}
sizes = {}
line_counts = {}
schars = {}
alphas = {}
bad = set()
# glob.glob returns matches in arbitrary order; sorting makes the processing
# and printing order reproducible between runs.
for fname in sorted(glob.glob("../cc_dset/certs/reports/txt/*")):
    path = Path(fname)

    # NOTE(review): decoded with the platform default encoding, as before --
    # confirm the encoding the text files were written with.
    text = path.read_text()
    dgst = path.stem

    avg = average_line_length(text)
    size = overall_size(text)
    nlines = num_lines(text)
    schar = every_second_char(text)
    alpha = alpha_chars(text)

    avgs[dgst] = avg
    sizes[dgst] = size
    line_counts[dgst] = nlines
    schars[dgst] = schar
    alphas[dgst] = alpha

    # Compare against the shared threshold constants rather than repeating the
    # literal values, so this exploration cannot drift from `garbage()`.
    # A report may fail several checks; one line is printed per failed check.
    if nlines < LINES_THRESHOLD:
        print(f"{dgst}:  nlines: {nlines:.2f}")
        bad.add(dgst)
    if size < SIZE_THRESHOLD:
        print(f"{dgst}:    size: {size:.2f}")
        bad.add(dgst)
    if avg < AVG_LLEN_THRESHOLD:
        print(f"{dgst}:     avg: {avg:.2f}")
        bad.add(dgst)
    if schar < EVERY_SECOND_CHAR_THRESHOLD:
        print(f"{dgst}:   schar: {schar:.2f}")
        bad.add(dgst)
    if alpha < ALPHA_CHARS_THRESHOLD:
        print(f"{dgst}:   alpha: {alpha:.2f}")
        bad.add(dgst)

In [None]:
# Summary of everything flagged: the count, then one aligned row of metrics
# per flagged digest.
print(len(bad))
print("                  nlines,   size,   lavg, schar, alpha")
# A set has no stable iteration order; sort so the table is the same on every run.
for b in sorted(bad):
    print(f"{b}: {line_counts[b]:>6}, {sizes[b]:>7}, {avgs[b]:>5.02f}, {schars[b]:>5}, {alphas[b]:>5.02f}")

# Sanity checks against the reference lists: True means the heuristics agree
# with the list (a bad report was flagged / a good report was not flagged).
for b in bad_reports:
    print(b in bad)

for b in good_reports:
    print(b not in bad)

In [None]:
# Histogram (30 bins) of the per-report line counts collected in `line_counts`.
# The trailing ";" suppresses the textual repr of the call's return value.
plt.hist(line_counts.values(), bins=30);

In [None]:
# Histogram (30 bins) of the per-report total character counts collected in `sizes`.
# The trailing ";" suppresses the textual repr of the call's return value.
plt.hist(sizes.values(), bins=30);

In [None]:
# Histogram (30 bins) of the per-report average line lengths collected in `avgs`.
# The trailing ";" suppresses the textual repr of the call's return value.
plt.hist(avgs.values(), bins=30);

In [None]:
# Histogram (30 bins) of the per-report `every_second_char` counts collected in `schars`.
# The trailing ";" suppresses the textual repr of the call's return value.
plt.hist(schars.values(), bins=30);

In [None]:
# Histogram (30 bins) of the per-report alphabetic-character ratios collected in `alphas`.
# The trailing ";" suppresses the textual repr of the call's return value.
plt.hist(alphas.values(), bins=30);

In [None]:
# Print the digest of every report PDF that has no "<digest>.txt" file in the
# txt directory.
txt_dir = Path("../cc_dset/certs/reports/txt")
for pdf_name in glob.glob("../cc_dset/certs/reports/pdf/*.pdf"):
    pdf_path = Path(pdf_name)
    dgst = pdf_path.stem

    txt_path = txt_dir / f"{dgst}.txt"
    if txt_path.exists():
        continue
    print(dgst)
