In [121]:
import subprocess
import shlex
from functools import lru_cache

import os

In [113]:
@lru_cache(maxsize=1000000)
def get_directory_size(host, user, path, port=22):
    escaped_path = shlex.quote(path)
    cmd = f'ssh -p {port} {user}@{host} "du -sb {escaped_path} | cut -f1"'
    result = subprocess.check_output(cmd, shell=True)
    return int(result.strip())

In [146]:
# PATH1='/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code'
PATH1='/volume1/homes/eric/_Documents/_Records/_By Year/_2025'
HOST1='192.168.2.2'
PORT1=8932
USER1='eric'

PATH2=PATH1
HOST2='192.168.2.6'
PORT2=PORT1
USER2=USER1

In [115]:
def print_size(size_byte):
    print(f"Total size: {size_bytes / (1024 ** 3):.2f} GB ({size_bytes:,} bytes)")

In [116]:
size_bytes = get_directory_size(HOST1, USER1, PATH1, port=PORT1)
print_size(size_bytes)

Total size: 20.66 GB (22,180,776,083 bytes)


In [117]:
size_bytes = get_directory_size(HOST2, USER2, PATH2, port=PORT2)
print_size(size_bytes)

Total size: 16.22 GB (17,420,762,172 bytes)


If the directory sizes are different, it could be because
* the smaller directory is missing files or directories the other has
* the smaller directory has all the files and directories the other has, but one of the subdirs is different

Strategy:
If dirA and dirB are the same:
* return "the two directories are the same!" (print that message and return None)

If dirA and dirB are different,
* list all files in dirA and dirB
* compare file list + sizes in dirA and dirB, report differences
* list all dirs in dirA and dirB
  * If there are different dir names in dirA and dirB, report differences
  * for dir names that match:
    * recurse
  

In [143]:
def list_files_with_sizes(host, user, path, port=22):
    escaped_path = shlex.quote(path)
    cmd = (
        f'ssh -p {port} {user}@{host} '
        f'"find {escaped_path} -maxdepth 1 -type f -printf \\"%p|%s\\\\n\\""'
    )

    result = subprocess.check_output(cmd, shell=True, text=True)
    files = []
    for line in result.strip().split('\n'):
        if not line:
            continue # skip blanks
        try:
            filepath, size_str = line.split('|')
            files.append((filepath, int(size_str)))
        except ValueError as e:
            print(f"*** ERROR: {e}, {host=}, {path=}, {line=}, {files=}")
            raise e

    return set(files)

def list_dirs(host, user, path, port=22):
    escaped_path = shlex.quote(path)
    cmd = (
        f'ssh -p {port} {user}@{host} '
        f'"find {escaped_path} -maxdepth 1 -type d ! -path {escaped_path} -printf \\"%p\\\\n\\""'
    )

    result = subprocess.check_output(cmd, shell=True, text=True)
    dirs = []
    for line in result.strip().split('\n'):
        if not line:
            continue # skip blanks
        try:
            dirs.append(line)
        except ValueError as e:
            print(f"*** ERROR: {e}, line={line}")
            raise e

    return set(dirs)

def compare(host1, host2, path, user, port=22, level=0):
    print(f'{" "*level*2}Comparing {path}')
    size1 = get_directory_size(host1, user, path, port=port)
    size2 = get_directory_size(host2, user, path, port=port)
    if size1 == size2:
        return
    print(f'{" "*level*2}{path} sizes are not the same. {size1=} {size2=}')
    files1 = list_files_with_sizes(host1, user, path, port)
    files2 = list_files_with_sizes(host2, user, path, port)
    # files1.add(('bad1', 42))
    # files2.add(('bad2', 43))
    if files1 - files2:
        diff = {os.path.relpath(f[0], path) for f in files1 - files2}
        print(f'{" "*level*2}Files in 1 but not 2: {diff}')
    if files2 - files1:
        diff = {os.path.relpath(f[0], path) for f in files2 - files1}
        print(f'{" "*level*2}Files in 2 but no 1: {diff}')        
    dirs1 = list_dirs(host1, user, path, port)
    dirs2 = list_dirs(host2, user, path, port)
    # dirs1.add('baddir1')
    # dirs2.add('baddir2')
    if dirs1 - dirs2:
        diff = {os.path.relpath(d, path) for d in dirs1 - dirs2}
        print(f'{" "*level*2}Dirs in 1 but not 2: {diff}')
    if dirs2 - dirs1:
        diff = {os.path.relpath(d, path) for d in dirs2 - dirs1}
        print(f'{" "*level*2}Dirs in 2 but not 1: {diff}')    
    # print(f'{" "*level*2}{dirs1=}')
    for subdir in dirs1 & dirs2:
        compare(host1, host2, subdir, user, port, level+1)

In [147]:
compare(HOST1, HOST2, PATH1, USER1, port=8932)

Comparing /volume1/homes/eric/_Documents/_Records/_By Year/_2025
/volume1/homes/eric/_Documents/_Records/_By Year/_2025 sizes are not the same. size1=11110963567 size2=11111031483
  Comparing /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Library
  /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Library sizes are not the same. size1=1889920 size2=1891654
    Comparing /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Library/@eaDir
    /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Library/@eaDir sizes are not the same. size1=2887 size2=3983
  Comparing /volume1/homes/eric/_Documents/_Records/_By Year/_2025/@eaDir
  /volume1/homes/eric/_Documents/_Records/_By Year/_2025/@eaDir sizes are not the same. size1=1752 size2=4098
  Comparing /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Reeves
  /volume1/homes/eric/_Documents/_Records/_By Year/_2025/Reeves sizes are not the same. size1=5559618 size2=5564566
    Comparing /volume1/homes/eric/_Documents/_Reco

KeyboardInterrupt: 