In [None]:
import sys

# Prepend the local "packages" directory and the parent directory to the
# import search path; "packages" is searched first, then "../".
sys.path[:0] = ["packages", "../"]

In [None]:
import os

# On CircleCI, point both the PySpark driver and the PySpark workers at the
# Python interpreter of the conda environment named by CONDA_DEFAULT_ENV.
if os.environ.get("CIRCLECI"):
    default_env = os.environ.get("CONDA_DEFAULT_ENV")
    ci_python = f"/home/circleci/miniconda/envs/{default_env}/bin/python"
    os.environ.update(
        PYSPARK_DRIVER_PYTHON=ci_python,
        PYSPARK_PYTHON=ci_python,
    )

# Custom Window Functions

This document outlines how to create features using the available custom window functions via examples.

The custom window sub-module provides custom user-defined functions for creating window features; they can be applied with `generate_window_grid` or `window_column`.

We will be using the following dataframe throughout our examples:

In [None]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Spark session used by all examples: console progress bars are switched off
# and shuffles are limited to a single partition.
session_builder = SparkSession.builder
session_builder = session_builder.config("spark.ui.showConsoleProgress", False)
session_builder = session_builder.config("spark.sql.shuffle.partitions", 1)
spark = session_builder.getOrCreate()


# Column layout of the example dataframe as (column name, Spark type) pairs.
column_types = [
    ("name", StringType()),
    ("date", DateType()),
    ("date_index", IntegerType()),
    ("y_flag", IntegerType()),
    ("x_flag", IntegerType()),
]
# Every column is declared nullable (third StructField argument is True).
schema = StructType(
    [StructField(column, dtype, True) for column, dtype in column_types]
)
# Example observations grouped by person. Each entry is
# (date, date_index, y_flag, x_flag); date_index grows by one per calendar day
# and None marks a missing x_flag. Cersei has no row for 2012-05-14.
observations = {
    "Gendry": [
        ("2012-05-01", 15461, 1, 2),
        ("2012-05-02", 15462, 0, None),
        ("2012-05-03", 15463, 1, None),
        ("2012-05-04", 15464, 0, 1),
        ("2012-05-05", 15465, 1, None),
    ],
    "Arya": [
        ("2012-05-06", 15466, 1, 0),
        ("2012-05-07", 15467, 0, None),
        ("2012-05-08", 15468, 0, 1),
        ("2012-05-09", 15469, 1, None),
        ("2012-05-10", 15470, 1, 2),
    ],
    "Cersei": [
        ("2012-05-10", 15470, 0, 3),
        ("2012-05-11", 15471, 1, None),
        ("2012-05-12", 15472, 1, None),
        ("2012-05-13", 15473, 1, None),
        ("2012-05-15", 15475, 0, 1),
    ],
}
# Flatten into one tuple per row, in the column order of the schema:
# (name, date, date_index, y_flag, x_flag).
data = [
    (name, pd.Timestamp(day).date(), date_index, y_flag, x_flag)
    for name, daily_rows in observations.items()
    for day, date_index, y_flag, x_flag in daily_rows
]
# Materialise the example rows as a Spark dataframe with the explicit schema.
df_window = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Print the example dataframe (up to 20 rows, long cell values truncated).
df_window.show(n=20, truncate=True)

The current list of custom window functions comprises 2 main functions:

In [None]:
from feature_generation.v1.core.features import custom_windows
import inspect


module = custom_windows
ignore_prefix = "_"
allow_list = []

# Keep a member when it is explicitly allow-listed, or when it is a function
# defined in `module` itself whose name does not start with `ignore_prefix`
# (an empty prefix disables the name filter).
found_functions = [
    (member_name, member)
    for member_name, member in inspect.getmembers(module)
    if member_name in allow_list
    or (
        inspect.isfunction(member)
        and inspect.getmodule(member) == module
        and (ignore_prefix == "" or not member_name.startswith(ignore_prefix))
    )
]

# Build one table row per discovered function: its qualified name and the
# first line of its docstring.
rows = []
for func_name, func in found_functions:
    # inspect.getdoc returns None for undocumented objects and strips leading
    # blank lines and indentation. Falling back to "" avoids an AttributeError
    # on a missing docstring, and the clean-up avoids an empty description
    # when the docstring text starts on the line after the opening quotes.
    doc_lines = (inspect.getdoc(func) or "").splitlines()
    x_doc = doc_lines[0] if doc_lines else ""
    rows.append(
        {"function": "{}.{}".format(module.__name__, func_name), "description": x_doc}
    )

from tabulate import tabulate
import pandas as pd

# Render the collected rows as a psql-style text table and print it.
table = pd.DataFrame(data=rows)
table_text = tabulate(table, headers=table.columns, tablefmt="psql")
print(table_text)

## `complete_*` functions
  
These UDFs perform their operation only if the entire window range is complete.

This is useful when computing forward-looking target variables (rangeBetween positive)
or backward-looking features (rangeBetween negative), from continuous data sequences,
that should only be computed for windows where all data entries are known
(e.g., if in February we count the number of events X between January and June,
the return value will be null because the data is not complete).

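Each function is intended to be paired with a window range of matching length: the argument passed to `complete_*` must equal the number of periods covered by the range in `ranges_between` (`end - start + 1`). For instance: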

Will work:
  - complete_sum(2) & ranges_between = [[1, 2]]
  - complete_sum(12) & ranges_between = [[1, 12]]
  - complete_sum(6) & ranges_between = [[3, 8]]
  - complete_sum(6) & ranges_between = [[-6, -1]]

Won't work (meaningless results):
  - complete_sum(2) & ranges_between = [[1, 3]]
  - complete_sum(12) & ranges_between = [[1, 6]]
  - complete_sum(6) & ranges_between = [[3, 12]]
  - complete_sum(6) & ranges_between = [[-5, -1]]

Note that input (spine) needs to be continuous historical data
(e.g. no date gaps for each UOA).

This is a known limitation at the moment.

In [None]:
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import (
    generate_window_grid,
    window_column,
    generate_windows_spec,
)
from feature_generation.v1.core.features.custom_windows import (
    complete_sum,
    complete_max,
)

# Number of periods covered by every window below. Each complete_* function is
# given the same length as the range it is paired with.
window_length = 2

# Features: sum of x_flag over the two previous periods, once with the plain
# Spark sum and once with complete_sum, generated through the window grid.
x_flag_past_grid = generate_window_grid(
    inputs=["x_flag"],
    funcs=[
        pyspark.sql.functions.sum,
        complete_sum(window_length, return_dtype="integer"),
    ],
    windows=[{"partition_by": ["name"], "order_by": ["date_index"]}],
    ranges_between=[[-window_length, -1]],
)
# Same complete_sum feature, declared as a single explicit window column.
x_flag_past_column = window_column(
    outputs="x_flag_complete_sum_past_2_1_wc",
    input=complete_sum(window_length, return_dtype="integer")(df_window["x_flag"]),
    windows_spec=generate_windows_spec(
        partition_by="name",
        order_by="date_index",
        range_between=[-window_length, -1],
    ),
)

# Target: max of y_flag over the next two periods, once with the plain Spark
# max and once with complete_max, generated through the window grid.
y_flag_next_grid = generate_window_grid(
    inputs=["y_flag"],
    funcs=[
        pyspark.sql.functions.max,
        complete_max(window_length, return_dtype="integer"),
    ],
    windows=[{"partition_by": ["name"], "order_by": ["date_index"]}],
    ranges_between=[[1, window_length]],
)
# Same complete_max target, declared as a single explicit window column.
y_flag_next_column = window_column(
    outputs="y_flag_complete_max_next_1_2_wc",
    input=complete_max(window_length, return_dtype="integer")(df_window["y_flag"]),
    windows_spec=generate_windows_spec(
        partition_by="name",
        order_by="date_index",
        range_between=[1, window_length],
    ),
)

windows_grid_code_config = [
    x_flag_past_grid,
    x_flag_past_column,
    y_flag_next_grid,
    y_flag_next_column,
]

df_windows_grid_code = create_columns_from_config(df_window, windows_grid_code_config)

In [None]:
# Print the dataframe with the generated window columns (up to 20 rows, long
# cell values truncated).
df_windows_grid_code.show(n=20, truncate=True)