In [11]:
import subprocess
from functools import lru_cache

In [24]:
@lru_cache(maxsize=1000000)
def get_directory_size_ssh(host, user, path, port=22):
    """Return the total size, in bytes, of a remote directory via ``du -sb`` over SSH.

    Results are memoised by ``lru_cache``: a repeated call with the same
    arguments returns the cached number without contacting the host again,
    so the value can be stale if the remote directory changes meanwhile.

    Args:
        host: Host name or IP address to connect to.
        user: Remote login name.
        path: Remote directory. It is inserted unquoted into the remote
            command line, so it must already be escaped for the remote
            shell (for example, spaces preceded by a backslash).
        port: SSH port (default 22).

    Returns:
        The byte count reported by ``du -sb`` as an ``int``.

    Raises:
        subprocess.CalledProcessError: if ``ssh`` exits with a non-zero status.
        ValueError: if the remote command printed nothing parseable as an
            integer (e.g. ``du`` failed but ``cut`` still succeeded).
    """
    # This string is what the *remote* shell parses.
    remote_cmd = f'du -sb {path} | cut -f1'
    # An argv list (no shell=True) bypasses the local shell entirely, so only
    # one layer of quoting -- the remote one -- applies to `path`.
    cmd = ['ssh', '-p', str(port), f'{user}@{host}', remote_cmd]
    result = subprocess.check_output(cmd)
    return int(result.strip())

In [25]:
# Directory to compare. Spaces are backslash-escaped because the path is later
# interpolated, unquoted, into a command line parsed by the remote shell.
# A raw string keeps those backslashes without relying on the invalid "\ "
# escape sequence (which newer Python versions warn about).
PATH1 = r'/volume1/homes/eric/_Documents/_Records/_By\ Year/2020/Factual/Laptop\ Dump/code'
HOST1 = '192.168.2.2'
PORT1 = 8932
USER1 = 'eric'

# Second host: same path, port and user; only the address differs.
PATH2 = PATH1
HOST2 = '192.168.2.6'
PORT2 = PORT1
USER2 = USER1

In [26]:
def print_size(size_byte):
    """Print a byte count both in GB and as a comma-grouped number of bytes.

    Args:
        size_byte: Size in bytes (a number). "GB" here means 1024 ** 3 bytes.

    Returns:
        None. The formatted line is written to standard output.
    """
    # Use the argument itself rather than a same-looking global variable, so
    # the function prints what it was actually given.
    print(f"Total size: {size_byte / (1024 ** 3):.2f} GB ({size_byte:,} bytes)")

In [27]:
# Measure the directory on host 1 and print the result.
size_bytes = get_directory_size_ssh(HOST1, USER1, PATH1, port=PORT1)
print_size(size_bytes)

Total size: 20.66 GB (22,180,776,083 bytes)


In [28]:
# Same measurement on host 2 (this overwrites the `size_bytes` from host 1).
size_bytes = get_directory_size_ssh(HOST2, USER2, PATH2, port=PORT2)
print_size(size_bytes)

Total size: 16.22 GB (17,420,762,172 bytes)


If the directory sizes are different, it could be because:
* the smaller directory is missing files or directories that the other has
* the smaller directory has every file and directory the other has, but the contents of at least one of the subdirectories differ

Strategy:
If dirA and dirB are the same:
* return "the two directories are the same!" (print that message and return None)

If dirA and dirB are different:
* list all files in dirA and dirB
* compare file list + sizes in dirA and dirB, report differences
* list all dirs in dirA and dirB
  * If there are different dir names in dirA and dirB, report differences
  * for dir names that match:
    * recurse
  

In [36]:
def list_files_with_sizes_ssh(host, user, path, port=22):
    """List the regular files directly inside a remote directory, with sizes.

    Runs ``find <path> -maxdepth 1 -type f`` on the remote host over SSH and
    parses one ``<path>|<size>`` record per output line.

    Args:
        host: Host name or IP address to connect to.
        user: Remote login name.
        path: Remote directory. It is inserted unquoted into the remote
            command line, so it must already be escaped for the remote
            shell (for example, spaces preceded by a backslash).
        port: SSH port (default 22).

    Returns:
        A set of ``(filepath, size_in_bytes)`` tuples; empty when the
        directory contains no regular files.

    Raises:
        subprocess.CalledProcessError: if ``ssh`` exits with a non-zero status.
    """
    # This string is what the *remote* shell parses; find itself turns the
    # two characters backslash + n into a newline after each record.
    remote_cmd = f'find {path} -maxdepth 1 -type f -printf "%p|%s\\n"'
    # An argv list (no shell=True) bypasses the local shell, so no second
    # layer of quote escaping is needed.
    cmd = ['ssh', '-p', str(port), f'{user}@{host}', remote_cmd]
    result = subprocess.check_output(cmd, text=True)

    files = set()
    for line in result.split('\n'):
        if not line:
            # Blank lines (trailing newline, or an empty listing) carry no record.
            continue
        try:
            # The size is always the last field, so split from the right:
            # a '|' inside the file name must not break the record.
            filepath, size_str = line.rsplit('|', 1)
            files.add((filepath, int(size_str)))
        except ValueError as e:
            # Best effort: report the malformed line and keep going.
            print(f"*** ERROR: {e}")
            continue

    return files

In [37]:
# (path, size) pairs for the regular files directly under PATH1 on host 1.
files1 = list_files_with_sizes_ssh(HOST1, USER1, PATH1, PORT1)

In [38]:
# (path, size) pairs for the regular files directly under PATH2 on host 2.
files2 = list_files_with_sizes_ssh(HOST2, USER2, PATH2, PORT2)

In [39]:
# Inspect the listing from host 2.
files2

{('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/#exits.py#',
  3),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/.DS_Store',
  16388),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/LIAsourcecode.zip',
  64107235),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/audiene.tar',
  205117952),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/csschallenge.zip',
  8343946),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/insights-ui.tar',
  124763648)}

In [40]:
# Inspect the listing from host 1.
files1

{('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/#exits.py#',
  3),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/.DS_Store',
  16388),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/LIAsourcecode.zip',
  64107235),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/audiene.tar',
  205117952),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/csschallenge.zip',
  8343946),
 ('/volume1/homes/eric/_Documents/_Records/_By Year/2020/Factual/Laptop Dump/code/insights-ui.tar',
  124763648)}

In [41]:
# (path, size) entries on host 1 with no identical entry on host 2.
files1 - files2

set()

In [42]:
# (path, size) entries on host 2 with no identical entry on host 1.
files2 - files1

set()

In [43]:
# Add a made-up entry so the set difference is non-empty -- presumably a
# sanity check of the comparison. NOTE: this mutates `files1` in place;
# re-run the cell that builds `files1` to get the real listing back.
files1.add(('bad', 42))

In [44]:
# Entries present in `files1` but not in `files2`.
files1 - files2

{('bad', 42)}

In [45]:
# Report the entries that listing 1 has and listing 2 lacks, if any.
only_in_1 = files1 - files2
if only_in_1:
    print(f"files in 1 but not 2: {only_in_1}")

files in 1 but not 2: {('bad', 42)}
