#Fingerprinter.ipynb v1.0

#What is this?
This is a computer program that takes the "fingerprint" of a text file that you give it. The purpose is to make it easier to spot tampering. A tiny change in the text will cause the fingerprint to be obviously different.

#How do I use this on a desktop?
1. Save your document as a .txt file - for example in Google Docs, go to File > Download > Plain text (.txt)
2. Upload your .txt file by clicking the folder icon to the left, then the upload icon.  
3. Hover your mouse just to the left of the cell containing "!pip install...". You should see a "play" icon. Click it. Some setup then runs automatically for a couple of minutes.
4. Do the same thing in the next cell. Nothing will happen except you should see a little green check mark in a few seconds. This is more automatic setup.
5. In the next cell, set the `input_path` to the name of the file you uploaded.
6. Now click the play icon to the left of the cell you just edited.
7. You should see a message containing the fingerprint.
8. Copy-paste that output message into the end of your document.
9. If you make revisions to your document, don't worry about removing the fingerprint before repeating the above steps - however, the result of this script will change, so you'll need to overwrite the old fingerprint with the updated one.

#What about a mobile device?
1. It is basically the same as above, but more awkward. Chrome on an iPad should work OK.

#Is it perfect?
No. Some text changes will not register and the fingerprint won't change; this is on purpose because sometimes web browsers make small changes when copy-pasting text between various apps, and we do not want those to confuse us. It ignores the numbers in numbered lists. Plus whatever bugs I'm unaware of. But as long as we run the same version of the script, we'll get the same answer for the same input.

#This code is doing lots of substitutions of special characters. What if I have math equations, e.g. an accounting formula, in my document?
In that case it is better to create the equation in something like Microsoft Equation Editor and take a screenshot. Upload the screenshot to a draft of your Reddit post, then download it again (Reddit does some image processing that usually changes the fingerprint of whatever image you upload). Then use this script to take the hash of all those equation images, and add those hashes to your main document as image captions using the Reddit post editor. See the `process_directory` function in the code below: it runs when `input_path` is set to a folder instead of a file, and it prints a hash for every `.webp` file in that folder.

In [None]:
!pip install python-docx odfpy beautifulsoup4 pillow striprtf

In [131]:
import os
import sys
import hashlib
import re
from pathlib import Path
from docx import Document
from lxml import etree
from odf.opendocument import load as load_odt
from odf.text import P, H, Span, List, ListItem, Note, NoteBody
from odf.element import Text
from bs4 import BeautifulSoup
from PIL import Image
from striprtf.striprtf import rtf_to_text

# Script version. It is embedded in the printed fingerprint message so a reader
# can tell which version of the script generated a given fingerprint.
VERSION = "1.0"

# Function to read a txt file
def read_txt(file_path):
    """Return the entire contents of the text file at ``file_path``.

    The file is decoded with the ``utf-8-sig`` codec, i.e. as UTF-8 with a
    leading byte-order mark skipped when one is present.
    """
    with open(file_path, mode='r', encoding='utf-8-sig') as handle:
        return handle.read()

# Function to normalize text
def normalize_text(text):
    """Reduce ``text`` to the canonical form that gets fingerprinted.

    Steps, in order:
      1. drop the "N." item number (and the whitespace after it) from the
         start of any line;
      2. trim leading and trailing whitespace;
      3. expand ligatures and swap look-alike typographic symbols for plain
         ASCII spellings;
      4. delete every remaining whitespace character.

    Returns the normalized string.
    """
    # Numbered-list markers such as "12. " at the beginning of a line.
    without_numbers = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
    trimmed = without_numbers.strip()

    # Every key is a single character and no replacement text contains a key,
    # so one translate() pass yields the same result as replacing them one by
    # one in sequence.
    substitutions = str.maketrans({
        # Ligatures
        'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff',
        'æ': 'ae', 'œ': 'oe',
        # Confusable symbols
        '“': '"', '”': '"', '‘': "'", '’': "'", '–': '-', '—': '-',
        '…': '...', ' ': ' ', '−': '-', '×': 'x', '·': '.',
        '«': '"', '»': '"', '„': '"',
        '©': '(C)', '®': '(R)', '™': '(TM)',
    })
    replaced = trimmed.translate(substitutions)

    # Spaces, tabs, newlines and carriage returns all go.
    return re.sub(r'\s+', '', replaced)


# Function to compute SHA-256 checksum
def compute_sha256(data):
    """Return the SHA-256 digest of the string ``data`` as a hex string.

    ``data`` is encoded as UTF-8 before being hashed.
    """
    encoded = data.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

# Function to process input file
def process_file(file_path):
    """Print the fingerprint message for the document at ``file_path``.

    ``file_path`` must provide a ``suffix`` attribute (the notebook passes a
    ``pathlib.Path``). Only ``.txt`` files are accepted.

    A fingerprint message already pasted into the document is left out of
    the hash: the text from the start marker through the stop marker is
    skipped. If the start marker appears without a stop marker, everything
    from the start marker to the end of the file is skipped. Only the first
    such region is handled.

    Raises:
        ValueError: if the file extension is not ``.txt``.
    """
    extension = file_path.suffix.lower()
    if extension != '.txt':
        raise ValueError(f"Unsupported file format: {extension}")
    text = read_txt(file_path)

    start_ignore_marker = "---------(Fingerprinter script should ignore the following!)-----------"
    stop_ignore_marker = "---------(Stop ignoring now)-----------"

    # partition() returns empty trailing pieces when the separator is absent,
    # so `head + tail` covers all three cases: no markers (head is the whole
    # text), start marker only (tail is empty), and both markers present.
    head, _, remainder = text.partition(start_ignore_marker)
    _, _, tail = remainder.partition(stop_ignore_marker)
    text_to_normalize = head + tail

    checksum = compute_sha256(normalize_text(text_to_normalize))
    message = f"""---------(Fingerprinter script should ignore the following!)-----------
The fingerprint (SHA-256 checksum) of the above text (with whitespace, special characters, and this message removed) is:
{checksum}
(Generated by Fingerprinter.ipynb version {VERSION})
https://colab.research.google.com/drive/1nPrdk4w_sl6zjJVWP-DS7HKIyuIth6KN?usp=sharing
---------(Stop ignoring now)-----------"""
    print(message)


# Function to process directory of .webp files
def process_directory(directory_path):
    """Print the SHA-256 checksum of each ``.webp`` file in a directory.

    Only files directly inside ``directory_path`` are considered (the glob
    pattern is not recursive). Each file is hashed as raw bytes, with no
    text normalization.

    Files are visited in sorted path order so the printed list comes out in
    the same order on every run; ``Path.glob`` on its own does not promise
    any particular order.

    Args:
        directory_path: a ``pathlib.Path`` pointing at the directory.
    """
    for file_path in sorted(directory_path.glob('*.webp')):
        checksum = hashlib.sha256(file_path.read_bytes()).hexdigest()
        print(f"SHA-256 checksum for {file_path}: {checksum}")



In [None]:
# Set this to the name of the .txt file you uploaded
# (or to a folder containing .webp images).
input_path = "Replace me with the name of your file.txt"

input_path = Path(input_path)

# A file gets a text fingerprint; a folder gets one checksum per .webp image.
if input_path.is_file():
    _handler = process_file
elif input_path.is_dir():
    _handler = process_directory
else:
    raise ValueError(f"Invalid input path: {input_path}")
_handler(input_path)