# Examples

## `pythonkit`

In [1]:
import datetime as dt
import functools
import time
import traceback

import time_machine
from toolz import curried

from onekit import pythonkit as pk
from onekit import timekit as tk

### `flatten`

In [2]:
# A messy input: scalars, an empty list, and lists/tuples nested to different depths.
irregular_list = [["one", 2], 3, [(4, "five")], [[["six"]]], "seven", []]

# `flatten` accepts several positional arguments; wrap the call in `list` to
# display every leaf in order. Strings stay whole and the empty list adds nothing.
extra_items = (8, [9, ("ten",)])
list(pk.flatten(irregular_list, *extra_items))

['one', 2, 3, 4, 'five', 'six', 'seven', 8, 9, 'ten']

### `highlight_string_differences`

Use in an `assert` statement to get more information:

In [3]:
# Two five-letter words that differ only in their second character.
lft_str, rgt_str = "hello", "hallo"

The common approach of printing both values might not be informative enough to spot subtle differences.

In [4]:
try:
    assert lft_str == rgt_str, f"{lft_str} != {rgt_str}"
except AssertionError:
    # Print the traceback instead of letting the failure stop the notebook.
    traceback.print_exc()

Traceback (most recent call last):
  File "/var/folders/zd/ffdggf1d63v1hgsm0s2ddt480000gn/T/ipykernel_14667/54865336.py", line 2, in <module>
    assert lft_str == rgt_str, f"{lft_str} != {rgt_str}"
           ^^^^^^^^^^^^^^^^^^
AssertionError: hello != hallo


With `highlight_string_differences`, it is easier to spot subtle differences.

In [5]:
def get_string_diff(lft_str: str, rgt_str: str) -> str:
    """Build an assertion message: a header line followed by the highlighted diff."""
    header = "lft_str != rgt_str\n"
    marked_diff = pk.highlight_string_differences(lft_str, rgt_str)
    return header + marked_diff


try:
    assert lft_str == rgt_str, get_string_diff(lft_str, rgt_str)
except AssertionError:
    # Print the traceback; its message now contains the marked-up comparison.
    traceback.print_exc()

Traceback (most recent call last):
  File "/var/folders/zd/ffdggf1d63v1hgsm0s2ddt480000gn/T/ipykernel_14667/788221307.py", line 6, in <module>
    assert lft_str == rgt_str, get_string_diff(lft_str, rgt_str)
           ^^^^^^^^^^^^^^^^^^
AssertionError: lft_str != rgt_str
hello
 |   
hallo


## `timekit`

### `timestamp`

In [6]:
# Mock the clock: "now" starts at midnight on 2024-01-01 for the cells below.
mocked_start = dt.datetime(2024, 1, 1)
traveller = time_machine.travel(mocked_start)
traveller.start();

In [7]:
# No timezone given: the recorded output has a +01:00 offset, presumably the
# local timezone of the machine that ran the notebook (TODO confirm the default).
tk.timestamp()

'2024-01-01T01:00:00.003316+01:00'

In [8]:
# Pass a timezone name to choose the zone; in UTC the output reads 00:00 on
# 2024-01-01, matching the mocked start time.
tk.timestamp("UTC")

'2024-01-01T00:00:00.009919+00:00'

In [9]:
# CET: the recorded output is one hour ahead of the UTC value (+01:00).
tk.timestamp("CET")

'2024-01-01T01:00:00.015242+01:00'

In [10]:
# A zone behind UTC: the recorded output falls on the previous day (2023-12-31) at -10:00.
tk.timestamp("US/Hawaii")

'2023-12-31T14:00:00.019510-10:00'

In [11]:
# A zone ahead of UTC: the recorded output reads 09:00 at +09:00.
tk.timestamp("Asia/Tokyo")

'2024-01-01T09:00:00.026142+09:00'

In [12]:
# Stop the time mock started for the `timestamp` examples.
traveller.stop()

### `stopwatch`

In [13]:
# Mock the clock again, this time starting at noon on 2024-01-01.
mocked_start = dt.datetime(2024, 1, 1, 12)
traveller = time_machine.travel(mocked_start)
traveller.start();

Use as a context manager.

In [14]:
# The recorded output is a single line: start timestamp -> end timestamp,
# followed by the elapsed seconds. No label is shown when none is passed.
with tk.stopwatch():
    time.sleep(0.05)

2024-01-01T13:00:00.003586+01:00 -> 2024-01-01T13:00:00.055359+01:00 took 0.051773s


`stopwatch` with a different timezone.

In [15]:
# The first positional argument is a label, appended to the output after a dash;
# `timezone` selects the zone of the printed timestamps (+00:00 in the output).
with tk.stopwatch("timezone example", timezone="UTC"):
    time.sleep(0.05)

2024-01-01T12:00:00.058243+00:00 -> 2024-01-01T12:00:00.113373+00:00 took 0.05513s - timezone example


In [16]:
# `fmt` is a strftime-style pattern applied to the printed timestamps.
# NOTE(review): the recorded output reports "took 0s" although the block sleeps
# for 50 ms; presumably the elapsed time is derived from the formatted,
# second-resolution timestamps. Confirm against the stopwatch implementation.
with tk.stopwatch("format example", timezone="UTC", fmt="%a, %d %b %Y %H:%M:%S"):
    time.sleep(0.05)

Mon, 01 Jan 2024 12:00:00 -> Mon, 01 Jan 2024 12:00:00 took 0s - format example


Use as a decorator.

In [17]:
# Decorator form without a label: the recorded output uses the function's
# name ("func") as the label.
@tk.stopwatch()
def func():
    time.sleep(0.1)

In [18]:
# Calling the decorated function prints one timing line for the call.
func()

2024-01-01T13:00:00.181272+01:00 -> 2024-01-01T13:00:00.286460+01:00 took 0.105188s - func


In [19]:
# Decorator form with an explicit label: the recorded output shows "my label"
# in place of the function name.
@tk.stopwatch("my label")
def func_with_label():
    time.sleep(0.1)

In [20]:
# Calling the decorated function prints one timing line for the call.
func_with_label()

2024-01-01T13:00:00.296208+01:00 -> 2024-01-01T13:00:00.398361+01:00 took 0.102153s - my label


In [21]:
# Stop the time mock started for the `stopwatch` examples.
traveller.stop()

## `sparkkit`

In [22]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from onekit import sparkkit as sk
from onekit.exception import OnekitError

In [23]:
# Small local session for the examples: a single local thread, one shuffle
# partition, RDD/shuffle compression off, and the web UI and console progress
# bar disabled.
spark = (
    SparkSession.builder.master("local[1]")
    .appName("spark-session-docs")
    .config("spark.sql.shuffle.partitions", 1)
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the setting always receives an integer.
    .config("spark.default.parallelism", os.cpu_count() or 1)
    .config("spark.rdd.compress", False)
    .config("spark.shuffle.compress", False)
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.executor.cores", 1)
    .config("spark.executor.instances", 1)
    .config("spark.ui.enabled", False)
    .config("spark.ui.showConsoleProgress", False)
    .getOrCreate()
)
# Raise the log threshold so only errors are reported from here on.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/22 07:41:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### `union` + `peek`

In [24]:
# Three small frames with the same columns; some `y` and `z` values are None.
batches = [
    spark.createDataFrame([dict(x=1, y=2.718, z="a"), dict(x=3, y=4.14, z=None)]),
    spark.createDataFrame([dict(x=5, y=None, z="c"), dict(x=7, y=8.28, z="d")]),
    spark.createDataFrame(
        [dict(x=0, y=1.414, z=None), dict(x=2000, y=3000.1, z="f")]
    ),
]

# `peek` runs inside `transform`: it displays the frame and the chain continues
# with its return value.
peek_before = functools.partial(
    sk.peek, n=20, shape=True, cache=True, schema=True, label="before"
)
peek_after = functools.partial(sk.peek, label="after")

df = (
    sk.union(*batches)
    .transform(peek_before)
    # Keeps rows with x + y < 10; the row whose y is missing is filtered out too.
    .where(F.col("x") + F.col("y") < 10)
    .transform(peek_after)
)

root
 |-- x: long (nullable = true)
 |-- y: double (nullable = true)
 |-- z: string (nullable = true)

shape=(6, 3)


Unnamed: 0,x,y,z
1,1,2.718,a
2,3,4.14,
3,5,,c
4,7,8.28,d
5,0,1.414,
6,2_000,3_000.1,f


Unnamed: 0,x,y,z
1,1,2.718,a
2,3,4.14,
3,0,1.414,


In [25]:
# Plain Spark rendering of the filtered frame, for comparison with the `peek` output.
df.show()

+---+-----+----+
|  x|    y|   z|
+---+-----+----+
|  1|2.718|   a|
|  3| 4.14|NULL|
|  0|1.414|NULL|
+---+-----+----+



### `assert_schema_equal`

In [26]:
# rgt_df has no `y` column, so the two schemas differ.
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=1), dict(x=3)])

try:
    sk.assert_schema_equal(lft_df, rgt_df)
except OnekitError:
    # The SchemaMismatchError shown in the output is caught here as an OnekitError.
    traceback.print_exc()

Traceback (most recent call last):
  File "/var/folders/zd/ffdggf1d63v1hgsm0s2ddt480000gn/T/ipykernel_14667/3363102269.py", line 5, in <module>
    sk.assert_schema_equal(lft_df, rgt_df)
  File "/Users/eugen/Workspace/onekit/src/onekit/sparkkit.py", line 389, in assert_schema_equal
    raise SchemaMismatchError(lft_schema, rgt_schema)
onekit.exception.SchemaMismatchError: num_diff=10
struct<x:bigint,y:bigint>
               ||||||||||
struct<x:bigint>


### `assert_row_count_equal`

In [27]:
# lft_df has two rows, rgt_df has one.
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=1)])

try:
    sk.assert_row_count_equal(lft_df, rgt_df)
except OnekitError:
    # The RowCountMismatchError shown in the output is caught here as an OnekitError.
    traceback.print_exc()

Traceback (most recent call last):
  File "/var/folders/zd/ffdggf1d63v1hgsm0s2ddt480000gn/T/ipykernel_14667/2179063999.py", line 5, in <module>
    sk.assert_row_count_equal(lft_df, rgt_df)
  File "/Users/eugen/Workspace/onekit/src/onekit/sparkkit.py", line 302, in assert_row_count_equal
    raise RowCountMismatchError(num_lft, num_rgt)
onekit.exception.RowCountMismatchError: num_lft=2, num_rgt=1, num_diff=1


### `assert_row_value_equal`

In [28]:
# Only the row (x=3, y=4) appears in both frames.
lft_df = spark.createDataFrame([dict(x=1, y=2), dict(x=3, y=4)])
rgt_df = spark.createDataFrame([dict(x=3, y=4), dict(x=5, y=6), dict(x=7, y=8)])

try:
    sk.assert_row_value_equal(lft_df, rgt_df)
except OnekitError:
    # The RowValueMismatchError shown in the output is caught here as an OnekitError.
    # NOTE(review): the reported num_lft=1 / num_rgt=2 look like counts of rows
    # found on one side only; confirm against the implementation.
    traceback.print_exc()

Traceback (most recent call last):
  File "/var/folders/zd/ffdggf1d63v1hgsm0s2ddt480000gn/T/ipykernel_14667/2487993813.py", line 5, in <module>
    sk.assert_row_value_equal(lft_df, rgt_df)
  File "/Users/eugen/Workspace/onekit/src/onekit/sparkkit.py", line 346, in assert_row_value_equal
    raise RowValueMismatchError(lft_rows, rgt_rows, num_lft, num_rgt)
onekit.exception.RowValueMismatchError: num_lft=1, num_rgt=2


In [29]:
# Shut down the Spark session created for the `sparkkit` examples.
spark.stop()