# Imports, constants & common functions

In [7]:
import pandas as pd
from pathlib import Path
import requests
import bs4 as bs
import time

In [16]:
# --- Settings a reader may want to tune ---
# Directory that holds the cached parquet files.
CACHE_DIR = Path('./data')
# When True, existing cache files are ignored and rebuilt.
OVERWRITE_CACHE = False
# Page URL template; `{name}` is replaced with the username being looked up.
URL_FORMAT = 'https://fragment.com/username/{name}'
# CSS selector for the status label inside the page's section header.
STATUS_SELECTOR = (
    '#aj_content > main > section.tm-section.tm-auction-section'
    ' > div.tm-section-header > h2 > span.tm-section-header-status'
)
# CSS selector for the first-column cell of the table rows.
# NOTE(review): presumably the price, going by the constant's name - confirm.
PRICE_SELECTOR = (
    '#aj_content > main > section.tm-section.clearfix'
    ' > div.tm-table-wrap > table > tbody > tr > td:nth-child(1) > div > div'
)

# --- Derived setup ---
# Make sure the cache directory (and any missing parents) exists up front.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Demo of cache pattern

In [26]:
# Naive cache: the file name is fixed, so it does not depend on which names are
# requested - editing `names` below will not invalidate an existing cache file.
names_cache_file = CACHE_DIR / 'names.parquet'

if names_cache_file.is_file() and (not OVERWRITE_CACHE):
    print(f'Reading cached: {names_cache_file}')
    df = pd.read_parquet(names_cache_file)
else:
    names = ['Adam', 'Coty', 'Bob']
    records_accumulator = list()
    for name in names:
        # A timeout (seconds) keeps a stalled connection from hanging the notebook forever
        response = requests.get(URL_FORMAT.format(name=name), timeout=10)
        soup = bs.BeautifulSoup(response.text, 'html')
        status_element = soup.select_one(STATUS_SELECTOR)
        if status_element is None:
            # Names without a status element are reported and left out of the result
            print(f'No status for {name}')
        else:
            status_text = status_element.text
            print(f'Status of "{name}":', status_text)
            records_accumulator.append(dict(name=name, status=status_text))
        # Throttle after every request, including the ones that yielded no status
        time.sleep(0.5)
    # Build the frame once from the collected records, then persist it for later runs
    df = pd.DataFrame(records_accumulator)
    print(f'Caching: {names_cache_file}')
    df.to_parquet(names_cache_file)

Reading cached: data/names.parquet


In [28]:
names = ['Adam', 'Coty', 'Bob']
# Lower-case and sort so that the same set of names always maps to the same cache file,
# regardless of the order or letter case in which they were listed
names_ordered_and_normalised = sorted([n.lower() for n in names])
names_str = '_'.join(names_ordered_and_normalised)

# The requested names are embedded in the file name, so a different list gets its own cache
names_cache_file = CACHE_DIR / f'names_{names_str}.parquet'

if names_cache_file.is_file() and (not OVERWRITE_CACHE):
    print(f'Reading cached: {names_cache_file}')
    df = pd.read_parquet(names_cache_file)
else:
    records_accumulator = list()
    for name in names:
        # A timeout (seconds) keeps a stalled connection from hanging the notebook forever
        response = requests.get(URL_FORMAT.format(name=name), timeout=10)
        soup = bs.BeautifulSoup(response.text, 'html')
        status_element = soup.select_one(STATUS_SELECTOR)
        if status_element is None:
            # Names without a status element are reported and left out of the result
            print(f'No status for {name}')
        else:
            status_text = status_element.text
            print(f'Status of "{name}":', status_text)
            records_accumulator.append(dict(name=name, status=status_text))
        # Throttle after every request, including the ones that yielded no status
        time.sleep(0.5)
    # Build the frame once from the collected records, then persist it for later runs
    df = pd.DataFrame(records_accumulator)
    print(f'Caching: {names_cache_file}')
    df.to_parquet(names_cache_file)

Reading cached: data/names_adam_bob_coty.parquet


# Cache + hash

In [18]:
# Gist https://gist.github.com/artoby/49c4d24e28a49f01728a5b4e7e53798a

from typing import Any, Optional
import hashlib
import base64
import json


def get_hash_bytes(value: Any, hash_bytes=10, salt=None) -> Optional[bytes]:
    """
    Compute an (optionally trimmed) SHA-1 digest of a JSON-serialisable value.

    The value is serialised with ``json.dumps(..., sort_keys=True)``, so dicts that
    differ only in key order produce the same hash.

    param value: object to hash; must be serialisable by ``json.dumps``
    param hash_bytes: how many bytes to take (from the end of the digest) for the trimmed hash,
        10 bytes of data -> 16 symbols string; ``None`` keeps the full digest
    param salt: optional string appended to the serialised value before hashing;
        ``None`` and ``''`` both mean "no salt"
    return: the selected digest bytes, or ``None`` when ``value`` is ``None``
    raises ValueError: when ``hash_bytes`` is given but is not positive
    """
    if value is None:
        return None

    # `None` is a legal setting ("do not trim"), so only range-check actual numbers;
    # comparing `None <= 0` would raise TypeError before the full-digest branch is reached
    if (hash_bytes is not None) and (hash_bytes <= 0):
        raise ValueError(f'Hash bytes value should be >0, but got "{hash_bytes}"')

    value_str = json.dumps(value, sort_keys=True)

    if (salt is not None) and (salt != ''):
        value_str = value_str + salt

    digest = hashlib.sha1(value_str.encode('utf-8')).digest()
    # Trimming keeps the trailing bytes of the digest
    return digest if hash_bytes is None else digest[-hash_bytes:]


def get_hash(value: Any, hash_bytes=10, salt=None) -> Optional[str]:
    """
    Text form of ``get_hash_bytes``: the digest bytes encoded as base32 with the
    trailing ``=`` padding removed.

    param value, hash_bytes, salt: forwarded unchanged to ``get_hash_bytes``
    return: the base32 string, or ``None`` when ``get_hash_bytes`` returns ``None``
    """
    digest_part = get_hash_bytes(value=value, hash_bytes=hash_bytes, salt=salt)
    if digest_part is not None:
        encoded = base64.b32encode(digest_part)
        return encoded.decode('utf-8').rstrip('=')

    return None

In [20]:
# Default settings: 10 digest bytes, rendered as a 16-character base32 string
get_hash('abc123')

'N3M4DMXOMXKBIP3I'

In [24]:
# The hash length does not grow with the length of the input
get_hash('Very very very very very very very very very very very very long string')

'CYTZ4Z4WOFU2QY4R'

In [25]:
# Same input as the previous cell -> same hash (the function is deterministic)
get_hash('Very very very very very very very very very very very very long string')

'CYTZ4Z4WOFU2QY4R'

In [23]:
# Any value that json.dumps can serialise may be hashed, e.g. a list of mixed types
arr = [1, 2, 'abc123', 'Very very very very very very very very very very very very long string']
get_hash(arr)

'WZV47YMVMYPDU242'

In [30]:
names = ['Adam', 'Coty', 'Bob']
# Lower-case and sort so that the same set of names always yields the same hash,
# regardless of the order or letter case in which they were listed
names_ordered_and_normalised = sorted([n.lower() for n in names])
# A fixed-length hash keeps the file name short however many names are requested
names_hash = get_hash(names_ordered_and_normalised)

names_cache_file = CACHE_DIR / f'names_{names_hash}.parquet'

if names_cache_file.is_file() and (not OVERWRITE_CACHE):
    print(f'Reading cached: {names_cache_file}')
    df = pd.read_parquet(names_cache_file)
else:
    records_accumulator = list()
    for name in names:
        # A timeout (seconds) keeps a stalled connection from hanging the notebook forever
        response = requests.get(URL_FORMAT.format(name=name), timeout=10)
        soup = bs.BeautifulSoup(response.text, 'html')
        status_element = soup.select_one(STATUS_SELECTOR)
        if status_element is None:
            # Names without a status element are reported and left out of the result
            print(f'No status for {name}')
        else:
            status_text = status_element.text
            print(f'Status of "{name}":', status_text)
            records_accumulator.append(dict(name=name, status=status_text))
        # Throttle after every request, including the ones that yielded no status
        time.sleep(0.5)
    # Build the frame once from the collected records, then persist it for later runs
    df = pd.DataFrame(records_accumulator)
    print(f'Caching: {names_cache_file}')
    df.to_parquet(names_cache_file)

Reading cached: data/names_WF343E5B6SH4EHWG.parquet


# Self shutdown to release resources

In [None]:
import os

# Hard-stop the process running this notebook so that its resources are released.
# Signal number 9 is SIGKILL on Linux/macOS; it is kept as a literal because
# `signal.SIGKILL` is not available on Windows. Nothing after this line executes.
os.kill(os.getpid(), 9)