# Examples

## `pythonkit`

In [1]:
import datetime as dt
import functools
import time
import traceback

import time_machine
from toolz import curried

import onekit.pythonkit as pk

### `timestamp`

In [2]:
# mock datetime: travel to midnight on 1 January 2024 so that the
# timestamps shown in the following cells are reproducible
traveller = time_machine.travel(dt.datetime(2024, 1, 1, 0, 0, 0))
traveller.start();

In [3]:
# no timezone argument: shows the default behaviour
pk.timestamp()

'2024-01-01 00:00:00'

In [4]:
# the mocked time with the timezone given explicitly as UTC
pk.timestamp("UTC")

'2024-01-01 00:00:00'

In [5]:
# Central European Time: one hour ahead of the UTC result
pk.timestamp("CET")

'2024-01-01 01:00:00'

In [6]:
# Hawaii: ten hours behind the UTC result, so still the previous day
pk.timestamp("US/Hawaii")

'2023-12-31 14:00:00'

In [7]:
# Tokyo: nine hours ahead of the UTC result
pk.timestamp("Asia/Tokyo")

'2024-01-01 09:00:00'

In [8]:
# stop mocking the clock once the `timestamp` examples are done
traveller.stop()

### `stopwatch`

In [9]:
# overwrite defaults for illustrative purposes:
# `fmt` controls how the start and stop timestamps are rendered in the report,
# `flush=False` is passed through to `pk.stopwatch` as a keyword argument
stopwatch = functools.partial(
    pk.stopwatch,
    fmt="%a, %d %b %Y %H:%M:%S",
    flush=False,
)

# mock datetime: travel to noon on 1 January 2024 so that the
# timestamps printed by the stopwatch examples are reproducible
traveller = time_machine.travel(dt.datetime(2024, 1, 1, 12, 0, 0))
traveller.start();

Use as a context manager.

Example 1: Measure total elapsed time of multiple statements.

In [10]:
# both statements inside the `with` block are timed together;
# the label is appended to the printed report
with stopwatch("example 1"):
    time.sleep(0.05)
    time.sleep(0.05)

Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 = 0.100356s - example 1


Example 2: Measure total elapsed time of multiple `stopwatch` instances.

In [11]:
# bind each stopwatch with `as` so it can be used after its block has finished
with stopwatch("example 2 - stopwatch 1") as sw1:
    time.sleep(0.05)

with stopwatch("example 2 - stopwatch 2") as sw2:
    time.sleep(0.05)

# adding two stopwatches displays their total elapsed time
sw1 + sw2

Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 = 0.050179s - example 2 - stopwatch 1
Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 = 0.050173s - example 2 - stopwatch 2


0.100352s - total elapsed time

In [12]:
# the format string set via `functools.partial` is available on the instance
sw1.fmt

'%a, %d %b %Y %H:%M:%S'

Example 3: `stopwatch` with different timezone.

In [13]:
# with timezone="CET" the printed timestamps read 13:00 instead of 12:00
with stopwatch("example 3", timezone="CET"):
    time.sleep(0.05)

Mon, 01 Jan 2024 13:00:00 -> Mon, 01 Jan 2024 13:00:00 = 0.050207s - example 3


Use as a decorator.

In [14]:
# the label passed to the decorator shows up in the timing report
@stopwatch("example 4")
def func_with_supplied_label():
    """Sleep briefly so that the decorator has something to time."""
    time.sleep(0.1)

In [15]:
# calling the decorated function prints the timing report
func_with_supplied_label()

Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 = 0.100176s - example 4


In [16]:
# without a label, the function name is used in the timing report
@stopwatch()
def func_with_no_supplied_label():
    """Sleep briefly so that the decorator has something to time."""
    time.sleep(0.1)

In [17]:
# the report ends with the function name because no label was supplied
func_with_no_supplied_label()

Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 = 0.100235s - func_with_no_supplied_label


In [18]:
# stop mocking the clock once the `stopwatch` examples are done
traveller.stop()

### `flatten`

In [19]:
# a mix of nested lists and tuples, plain values, and an empty list
irregular_list = [
    ["one", 2],
    3,
    [(4, "five")],
    [[["six"]]],
    "seven",
    [],
]

# additional positional arguments are flattened as well; strings are kept whole
list(pk.flatten(irregular_list, 8, [9, ("ten",)]))

['one', 2, 3, 4, 'five', 'six', 'seven', 8, 9, 'ten']

### `highlight_string_differences`

Use in an `assert` statement to get more information:

In [20]:
lft_str = "hello"
rgt_str = "hallo"  # differs from lft_str in the second character only

The common approach of printing both values might not be informative enough to spot subtle differences.

In [21]:
try:
    assert lft_str == rgt_str, f"{lft_str} != {rgt_str}"
except AssertionError:  # print the traceback instead of letting the cell fail
    traceback.print_exc()

Traceback (most recent call last):
  File "/tmp/ipykernel_61960/54865336.py", line 2, in <module>
    assert lft_str == rgt_str, f"{lft_str} != {rgt_str}"
           ^^^^^^^^^^^^^^^^^^
AssertionError: hello != hallo


With `highlight_string_differences`, it is easier to spot subtle differences.

In [22]:
def get_string_diff(lft_str: str, rgt_str: str) -> str:  # header line + marked-up diff
    return "lft_str != rgt_str\n" + pk.highlight_string_differences(lft_str, rgt_str)


try:
    assert lft_str == rgt_str, get_string_diff(lft_str, rgt_str)
except AssertionError:  # print the traceback instead of letting the cell fail
    traceback.print_exc()

Traceback (most recent call last):
  File "/tmp/ipykernel_61960/788221307.py", line 6, in <module>
    assert lft_str == rgt_str, get_string_diff(lft_str, rgt_str)
           ^^^^^^^^^^^^^^^^^^
AssertionError: lft_str != rgt_str
hello
 |   
hallo


### Date computation

#### `date_count_forward`

In [23]:
# month sequence - first date:
# count forward from 1 January 2024, keep the dates that fall on the first
# day of a month, convert them to strings, and take the first five
curried.pipe(
    pk.date_count_forward(dt.date(2024, 1, 1)),
    curried.filter(lambda d: d.day == 1),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01', '2024-05-01']

In [24]:
# month sequence - last date:
# identify each month by its first day, then map that day to the
# last date of the same month before converting to strings
curried.pipe(
    pk.date_count_forward(dt.date(2024, 1, 1)),
    curried.filter(lambda d: d.day == 1),
    curried.map(lambda d: pk.last_date_of_month(d.year, d.month)),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30', '2024-05-31']

In [25]:
# Monday sequence:
# keep only the dates whose weekday is reported as "Mon" and take the first five
curried.pipe(
    pk.date_count_forward(dt.date(2024, 1, 1)),
    curried.filter(lambda d: pk.weekday(d) == "Mon"),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-01', '2024-01-08', '2024-01-15', '2024-01-22', '2024-01-29']

In [26]:
# pick every 14th day:
# `take_nth(14)` keeps the start date and every 14th date after it
curried.pipe(
    pk.date_count_forward(dt.date(2024, 1, 1)),
    curried.take_nth(14),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-01', '2024-01-15', '2024-01-29', '2024-02-12', '2024-02-26']

Task: The digits of 22 February 2022 form [a palindrome and an ambigram](https://en.wikipedia.org/wiki/Twosday) in dd-mm-yyyy format.
List the next five dates with these properties.

In [27]:
def format_date(d: dt.date, with_hyphen: bool) -> str:
    """Render ``d`` as ``dd-mm-yyyy`` or, without separators, as ``ddmmyyyy``."""
    if with_hyphen:
        return d.strftime("%d-%m-%Y")
    return d.strftime("%d%m%Y")


def is_palindrome_date(d: dt.date) -> bool:
    """Return True when the ``ddmmyyyy`` digits of ``d`` read the same both ways."""
    digits = format_date(d, False)
    return digits == "".join(reversed(digits))


def is_ambigram_date(d: dt.date) -> bool:
    """Return True when every ``ddmmyyyy`` digit of ``d`` is one of 0, 1, 2 or 8."""
    # NOTE(review): 5 is not in the set; on a seven-segment display it arguably
    # also reads the same upside down - confirm that excluding it is intended
    allowed_digits = frozenset("0128")
    return allowed_digits.issuperset(format_date(d, False))


def show_date(d: dt.date) -> str:
    """Pair the hyphenated and the digits-only rendering of ``d`` for display."""
    hyphenated, digits_only = format_date(d, True), format_date(d, False)
    return f"{hyphenated} ↦ {digits_only}"


# start on the day after 22 February 2022 so that this date itself is not listed;
# a date must pass both filters before it is formatted and collected
curried.pipe(
    pk.date_count_forward(dt.date(2022, 2, 23)),
    curried.filter(is_palindrome_date),
    curried.filter(is_ambigram_date),
    curried.map(show_date),
    curried.take(5),
    list,
)

['08-02-2080 ↦ 08022080',
 '18-02-2081 ↦ 18022081',
 '28-02-2082 ↦ 28022082',
 '10-12-2101 ↦ 10122101',
 '20-12-2102 ↦ 20122102']

#### `date_range`

In [28]:
# month sequence - first date:
# the range is given by explicit start and end dates; keep the dates
# that fall on the first day of a month and convert them to strings
curried.pipe(
    pk.date_range(dt.date(2024, 1, 1), dt.date(2024, 5, 31)),
    curried.filter(lambda d: d.day == 1),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01', '2024-05-01']

In [29]:
# month sequence - last date:
# identify each month in the range by its first day, then map that day
# to the last date of the same month before converting to strings
curried.pipe(
    pk.date_range(dt.date(2024, 1, 1), dt.date(2024, 5, 31)),
    curried.filter(lambda d: d.day == 1),
    curried.map(lambda d: pk.last_date_of_month(d.year, d.month)),
    curried.map(pk.date_to_str),
    curried.take(5),
    list,
)

['2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30', '2024-05-31']

## `sparkkit`

In [30]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import onekit.sparkkit as sk

In [31]:
# local Spark session with a minimal configuration for the examples below
spark = (
    SparkSession.builder.master("local[1]")
    .appName("spark-session-docs")
    .config("spark.sql.shuffle.partitions", 1)
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so that a valid integer is always configured
    .config("spark.default.parallelism", os.cpu_count() or 1)
    .config("spark.rdd.compress", False)
    .config("spark.shuffle.compress", False)
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.executor.cores", 1)
    .config("spark.executor.instances", 1)
    .config("spark.ui.enabled", False)
    .config("spark.ui.showConsoleProgress", False)
    .getOrCreate()
)
# raise the log level to ERROR for the remaining cells
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/22 16:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### `union`

In [32]:
df = (
    # combine the three data frames into one
    sk.union(
        spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)]),
        spark.createDataFrame([dict(x=5, y=6), dict(x=7, y=8)]),
        spark.createDataFrame([dict(x=0, y=1), dict(x=2, y=3)]),
    )
    # inspect the combined frame mid-chain: schema, shape, and rows with an index
    .transform(sk.peek(20, shape=True, cache=True, schema=True, index=True))
    .where(F.col("x") + F.col("y") < 10)
    # inspect again after filtering, this time showing the rows only
    .transform(sk.peek())
)

root
 |-- x: long (nullable = true)
 |-- y: long (nullable = true)

shape = (6, 2)


Unnamed: 0,x,y
1,1,2
2,3,4
3,5,6
4,7,8
5,0,1
6,2,3


x,y
1,2
3,4
0,1
2,3


In [33]:
# the chain still yields the filtered data frame after both `peek` calls
df.show()

+---+---+
|  x|  y|
+---+---+
|  1|  2|
|  3|  4|
|  0|  1|
|  2|  3|
+---+---+



### `assert_schema_equal`

In [34]:
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=1), dict(x=3)])  # has no column y

try:
    sk.assert_schema_equal(lft_df, rgt_df)
except sk.SparkkitError:  # the SchemaMismatchError raised here is caught by this clause
    traceback.print_exc()

Traceback (most recent call last):
  File "/tmp/ipykernel_61960/791909425.py", line 5, in <module>
    sk.assert_schema_equal(lft_df, rgt_df)
  File "/workspaces/onekit/src/onekit/sparkkit.py", line 429, in assert_schema_equal
    raise SchemaMismatchError(lft_schema, rgt_schema)
onekit.sparkkit.SchemaMismatchError: n_diff=10
struct<x:bigint,y:bigint>
               ||||||||||
struct<x:bigint>


### `assert_row_count_equal`

In [35]:
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=1)])  # one row fewer than lft_df

try:
    sk.assert_row_count_equal(lft_df, rgt_df)
except sk.SparkkitError:  # the RowCountMismatchError raised here is caught by this clause
    traceback.print_exc()

Traceback (most recent call last):
  File "/tmp/ipykernel_61960/3542645773.py", line 5, in <module>
    sk.assert_row_count_equal(lft_df, rgt_df)
  File "/workspaces/onekit/src/onekit/sparkkit.py", line 344, in assert_row_count_equal
    raise RowCountMismatchError(n_lft, n_rgt)
onekit.sparkkit.RowCountMismatchError: n_lft=2, n_rgt=1, n_diff=1


### `assert_row_equal`

In [36]:
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=3, y=4), dict(x=5, y=6), dict(x=7, y=8)])  # shares only (3, 4)

try:
    sk.assert_row_equal(lft_df, rgt_df)
except sk.SparkkitError:  # the RowMismatchError raised here is caught by this clause
    traceback.print_exc()

Traceback (most recent call last):
  File "/tmp/ipykernel_61960/553075025.py", line 5, in <module>
    sk.assert_row_equal(lft_df, rgt_df)
  File "/workspaces/onekit/src/onekit/sparkkit.py", line 387, in assert_row_equal
    raise RowMismatchError(lft_rows, rgt_rows, n_lft, n_rgt)
onekit.sparkkit.RowMismatchError: n_lft=1, n_rgt=2


In [37]:
# shut down the Spark session created for the `sparkkit` examples
spark.stop()