In [None]:
import sys

# Prepend the parent directory and the local "packages" directory to the module
# search path. Entries are inserted at the front one after another, so
# "packages" ends up ahead of "../".
for extra_path in ("../", "packages"):
    sys.path.insert(0, extra_path)

In [None]:
import os

# When running on CircleCI, point both PySpark interpreter variables at the
# Python binary of the conda environment named by CONDA_DEFAULT_ENV.
if os.environ.get("CIRCLECI"):
    default_env = os.environ.get("CONDA_DEFAULT_ENV")
    conda_python = f"/home/circleci/miniconda/envs/{default_env}/bin/python"
    os.environ.update(
        {
            "PYSPARK_DRIVER_PYTHON": conda_python,
            "PYSPARK_PYTHON": conda_python,
        }
    )

# Derived Features

This document outlines the functions available to create derived features using
examples.

We will be using the following dataframe:

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ArrayType,
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Spark session with console progress bars disabled and logging switched off,
# so that the notebook output only contains the dataframes being shown.
spark = (
    SparkSession.builder
    .config("spark.ui.showConsoleProgress", False)
    .getOrCreate()
)
spark.sparkContext.setLogLevel("OFF")

# All columns are nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("name", StringType()),
            ("date", DateType()),
            ("date_index", IntegerType()),
            ("x_flag", IntegerType()),
            ("y_flag", IntegerType()),
        )
    ]
)

# One row per (name, date). `date_index` appears to be the date expressed as
# days since 1970-01-01 (TODO confirm); `y_flag` is deliberately sparse.
data = [
    (name, pd.Timestamp(day).date(), date_index, x_flag, y_flag)
    for name, day, date_index, x_flag, y_flag in (
        ("Gendry", "2012-05-01", 15461, 1, 0),
        ("Gendry", "2012-05-02", 15462, 0, None),
        ("Gendry", "2012-05-03", 15463, 1, None),
        ("Gendry", "2012-05-04", 15464, 0, 1),
        ("Gendry", "2012-05-05", 15465, 1, None),
        ("Arya", "2012-05-06", 15466, 1, 0),
        ("Arya", "2012-05-07", 15467, 0, None),
        ("Arya", "2012-05-08", 15468, 0, 1),
        ("Arya", "2012-05-09", 15469, 1, None),
        ("Arya", "2012-05-10", 15470, 1, 2),
        ("Cersei", "2012-05-10", 15470, 0, 0),
        ("Cersei", "2012-05-11", 15471, 1, None),
        ("Cersei", "2012-05-12", 15472, 1, None),
        ("Cersei", "2012-05-13", 15473, 1, None),
        ("Cersei", "2012-05-15", 15475, 0, 1),
    )
]
df_window = spark.createDataFrame(data, schema)

In [None]:
# Print the example dataframe that the window examples below operate on.
df_window.show()

## Windows

This util provides a function that takes a spark dataframe and creates window
features based on a dictionary or config. It provides the ability to create a single window
feature per entry or can create multiple window features given a grid of columns,
aggregation functions, windows, and ranges. This can be used as a part of the
`create_column_from_config` function.

### `generate_windows_spec`:
Creates a list of `WindowSpec` objects, based on the config passed.
This function can be used when working with custom functions to remove
repetitive window creations.

The output of this function is typically passed to a function which
accepts window specs (`generate_window_grid`, `window_column`), or you can
aggregate over a window spec directly while creating new columns.

Code example below:

In [None]:
from pyspark.sql import functions as f
from feature_generation.v1.core.features.windows import generate_windows_spec

# Window over rows sharing the same `name`, ordered by `date_index`, with a
# range of [-5, -1] relative to the current row's `date_index`.
windows_spec = generate_windows_spec(
    partition_by="name",
    order_by="date_index",
    range_between=[-5, -1],
)

# `generate_windows_spec` returns a list, so take its first (only) entry.
x_flag_min_last_5d = f.min("x_flag").over(windows_spec[0])
df_windows_spec = df_window.withColumn("x_flag_min_last_5d", x_flag_min_last_5d)

df_windows_spec.show()

Note that `generate_windows_spec` returns a list of window specs, which is why the example above indexes into it with `windows_spec[0]`.

The example above is equivalent to:

In [None]:
from pyspark.sql import Window

# Same window as above, written directly against the PySpark `Window` API.
last_5d_window = (
    Window.partitionBy("name").orderBy("date_index").rangeBetween(-5, -1)
)

df_windows_spec = df_window.withColumn(
    "x_flag_min_last_5d", f.min("x_flag").over(last_5d_window)
)

df_windows_spec.show()

### `window_column`:
Creates a new column given the window configuration.


Core example below:

In [None]:
from pyspark.sql import functions as f
from pyspark.sql import Window
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import window_column

# Base window shared by every column below: per `name`, ordered by `date_index`.
per_name_window = Window.partitionBy("name").orderBy("date_index")

# Single output column with a single window spec.
sum_last_3d = window_column(
    "x_flag_sum_last_3d",
    f.sum("x_flag"),
    per_name_window.rangeBetween(-3, -1),
)

# Several output columns at once: one output name per window spec, in order.
mean_single_row_lags = window_column(
    ["x_flag_mean_last_2d", "x_flag_mean_last_3d"],
    f.mean("x_flag"),
    [
        per_name_window.rowsBetween(-2, -2),
        per_name_window.rowsBetween(-3, -3),
    ],
)

windows_col_code_config = [sum_last_3d, mean_single_row_lags]

df_windows_col_code = create_columns_from_config(df_window, windows_col_code_config)

df_windows_col_code.show()

Node example below (we will need to use the `generate_windows_spec` function if calling
from parameters):

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)
from pyspark.sql import functions as f
from pyspark.sql import Window

# Dotted paths used in the "object" entries of the config below. Nested dicts
# describe the arguments of each object (presumably instantiated by the
# node-level `create_columns_from_config` - confirm against its docs).
window_column_path = "feature_generation.v1.core.features.windows.window_column"
windows_spec_path = "feature_generation.v1.core.features.windows.generate_windows_spec"

windows_col_config = [
    # Single output column with a range-based window.
    dict(
        object=window_column_path,
        outputs="x_flag_sum_last_3d",
        input=dict(object="pyspark.sql.functions.sum", col="x_flag"),
        windows_spec=dict(
            object=windows_spec_path,
            partition_by="name",
            order_by="date_index",
            range_between=[-3, -1],
        ),
    ),
    # Two output columns, one per entry of `rows_between`.
    dict(
        object=window_column_path,
        outputs=["x_flag_mean_last_2d", "x_flag_mean_last_3d"],
        input=dict(object="pyspark.sql.functions.mean", col="x_flag"),
        windows_spec=dict(
            object=windows_spec_path,
            partition_by="name",
            order_by="date_index",
            rows_between=[[-2, -2], [-3, -3]],
        ),
    ),
]

df_windows_col = create_columns_from_config(df_window, windows_col_config)

df_windows_col.show()

### `generate_window_grid`:
Generates window columns given a configuration grid. Grid should contain columns,
aggregation functions, windows, and ranges. Note that the user must ensure
compatibility of all combinations within the grid.


Core example below:

In [None]:
from pyspark.sql import functions as f
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import generate_window_grid

# The same per-name window in both sort directions (ascending, then descending).
grid_windows = [
    {"partition_by": ["name"], "order_by": ["date_index"], "descending": is_descending}
    for is_descending in (False, True)
]

# Range bounds to evaluate. Bounds are integers or unbounded keywords; the
# keywords are written in both upper and lower case here (presumably
# case-insensitive - confirm).
grid_ranges = [
    [-1, -1],
    [0, 0],
    [1, 1],
    [-1, "UNBOUNDED FOLLOWING"],
    ["unbounded preceding", 0],
]

# Every combination of input column, aggregation, window and range.
window_grid_columns = generate_window_grid(
    inputs=["x_flag", "y_flag"],
    funcs=[f.max, f.sum],
    windows=grid_windows,
    ranges_between=grid_ranges,
    negative_term="past",
    positive_term="next",
    suffix="d",
)
windows_grid_code_config = [window_grid_columns]

df_windows_grid_code = create_columns_from_config(df_window, windows_grid_code_config)

df_windows_grid_code.show()

Node example below:

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Dictionary form of the grid above; aggregation functions are given as dotted
# paths under "object" instead of Python callables.
window_grid_entry = dict(
    object="feature_generation.v1.core.features.windows.generate_window_grid",
    inputs=["x_flag", "y_flag"],
    funcs=[
        {"object": function_path}
        for function_path in (
            "pyspark.sql.functions.sum",
            "pyspark.sql.functions.max",
        )
    ],
    windows=[
        {"partition_by": ["name"], "order_by": ["date_index"], "descending": is_descending}
        for is_descending in (False, True)
    ],
    ranges_between=[
        [-1, -1],
        [0, 0],
        [1, 1],
        [-1, "UNBOUNDED FOLLOWING"],
        ["unbounded preceding", 0],
    ],
    negative_term="past",
    positive_term="next",
    suffix="d",
)
window_grid_config = [window_grid_entry]

df_windows_grid = create_columns_from_config(df_window, window_grid_config)

df_windows_grid.show()

### `generate_distinct_element_window_grid`:
Generates window columns of distinct elements given a grid. The grid should contain
columns, windows, and ranges. Note that each input column should only contain arrays of the
elements we want to calculate over the grid.

We will be using the following dataframe for generate_distinct_element_window_grid:

In [None]:
from pyspark.sql.types import (
    ArrayType,
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Reuse (or create) the Spark session.
spark = SparkSession.builder.getOrCreate()

# Same columns as the window example dataframe plus a nullable array-of-strings
# column; every column is nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("name", StringType()),
            ("date", DateType()),
            ("date_index", IntegerType()),
            ("x_flag", IntegerType()),
            ("y_flag", IntegerType()),
            ("array_col", ArrayType(StringType())),
        )
    ]
)

# Columns per row: name, date, date_index, x_flag, y_flag, array_col.
data = [
    (name, pd.Timestamp(day).date(), date_index, x_flag, y_flag, elements)
    for name, day, date_index, x_flag, y_flag, elements in (
        ("Gendry", "2012-05-01", 15461, 1, 0, ["a", "b", "b", "a", "a"]),
        ("Gendry", "2012-05-02", 15462, 0, None, ["a", "b", "b", "a", "a"]),
        ("Gendry", "2012-05-03", 15463, 1, None, ["b", "c", "a", "a"]),
        ("Gendry", "2012-05-04", 15464, 0, 1, ["b", "a", "a"]),
        ("Gendry", "2012-05-05", 15465, 1, None, ["a", "a"]),
        ("Arya", "2012-05-06", 15466, 1, 0, ["b", "a", "a", "a", "b"]),
        ("Arya", "2012-05-07", 15467, 0, None, ["b", "a", "a", "a", "b"]),
        ("Arya", "2012-05-08", 15468, 0, 1, ["a", "a", "c", "b"]),
        ("Arya", "2012-05-09", 15469, 1, None, ["a", "a", "b"]),
        ("Arya", "2012-05-10", 15470, 1, 2, ["b"]),
        ("Cersei", "2012-05-10", 15470, 0, 0, ["a", "b", "c", "d", "a"]),
        ("Cersei", "2012-05-11", 15471, 1, None, ["a", "b", "a", "b", "a"]),
        ("Cersei", "2012-05-12", 15472, 1, None, ["b", "c", "b", "a"]),
        ("Cersei", "2012-05-13", 15473, 1, None, ["a", "b", "a"]),
        ("Cersei", "2012-05-15", 15475, 0, 1, ["b", "a"]),
    )
]
df_array = spark.createDataFrame(data, schema)

In [None]:
# Print the array dataframe without truncating cell contents, so the full
# `array_col` values are visible.
df_array.show(truncate=False)

Core example below:

In [None]:
from pyspark.sql import functions as f
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import (
    generate_distinct_element_window_grid,
)

# Core example: only the core `create_columns_from_config` is imported here.
# (Importing the node-level function of the same name first would be pointless,
# as the core import rebinds the name.)

# Grid with a single input column, a single ascending per-name window and a
# single range of [-1, -1].
distinct_windows_grid_code_config = [
    generate_distinct_element_window_grid(
        inputs=[
            "array_col",
        ],
        windows=[
            {"partition_by": ["name"], "order_by": ["date_index"], "descending": False},
        ],
        ranges_between=[
            [-1, -1],
        ],
        negative_term="past",
        positive_term="next",
        suffix="d",
    )
]

df_distinct_windows_grid_code = create_columns_from_config(
    df_array, distinct_windows_grid_code_config
)
df_distinct_windows_grid_code.show()

Node example below:

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Dictionary form of the distinct-element grid: one input column, one ascending
# per-name window and one range of [-1, -1].
distinct_window_grid_entry = dict(
    object="feature_generation.v1.core.features.windows.generate_distinct_element_window_grid",
    inputs=["array_col"],
    windows=[
        dict(partition_by=["name"], order_by=["date_index"], descending=False),
    ],
    ranges_between=[[-1, -1]],
    negative_term="past",
    positive_term="next",
    suffix="d",
)
distinct_window_grid_config = [distinct_window_grid_entry]

df_distinct_windows_grid = create_columns_from_config(
    df_array, distinct_window_grid_config
)

df_distinct_windows_grid.show()

### `generate_window_ratio`:
Generates ratio of window columns given a grid. Grid should contain columns,
aggregation functions, windows, and ranges. Note that the user must ensure compatibility
 of all combinations within the grid.

Core example below:

In [None]:
from pyspark.sql import functions as f
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import generate_window_ratio

# Add the column that both inputs are paired with in the ratio config below.
total_flag = f.col("x_flag") + f.col("y_flag")
df_window = df_window.withColumn("total_flag", total_flag)

# Single ascending per-name window, evaluated over three one-step ranges.
ratio_windows = [
    {"partition_by": ["name"], "order_by": ["date_index"], "descending": False},
]
ratio_ranges = [[-1, -1], [0, 0], [1, 1]]

window_ratio_columns = generate_window_ratio(
    # Each input column is mapped to the column it is paired with
    # (presumably numerator -> denominator; confirm against the function docs).
    inputs={flag_column: "total_flag" for flag_column in ("x_flag", "y_flag")},
    funcs=[f.max, f.sum],
    windows=ratio_windows,
    ranges_between=ratio_ranges,
    negative_term="past",
    positive_term="next",
    suffix="d",
)
windows_ratio_code_config = [window_ratio_columns]

df_windows_ratio_code = create_columns_from_config(df_window, windows_ratio_code_config)
df_windows_ratio_code.show()

Node example below:

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# (Re)compute `total_flag`, which both inputs are paired with below.
total_flag = f.col("x_flag") + f.col("y_flag")
df_window = df_window.withColumn("total_flag", total_flag)

# Dictionary form of the ratio grid; aggregation functions are given as dotted
# paths under "object".
windows_ratio_entry = dict(
    object="feature_generation.v1.core.features.windows.generate_window_ratio",
    inputs=dict(x_flag="total_flag", y_flag="total_flag"),
    funcs=[
        {"object": function_path}
        for function_path in (
            "pyspark.sql.functions.sum",
            "pyspark.sql.functions.max",
        )
    ],
    windows=[
        dict(partition_by=["name"], order_by=["date_index"], descending=False),
    ],
    ranges_between=[[-1, -1], [0, 0], [1, 1]],
    negative_term="past",
    positive_term="next",
    suffix="d",
)
windows_ratio_config = [windows_ratio_entry]

df_windows_ratio = create_columns_from_config(df_window, windows_ratio_config)

df_windows_ratio.show()

### `generate_window_delta`:
Generates delta of window columns given a grid. Grid should contain columns,
aggregation functions, windows, and ranges. Note that the user must ensure compatibility
 of all combinations within the grid.

Core example below:

In [None]:
from pyspark.sql import functions as f
from feature_generation.v1.core.features.create_column import create_columns_from_config
from feature_generation.v1.core.features.windows import generate_window_delta

# Single ascending per-name window, evaluated over two ranges.
delta_windows = [
    {"partition_by": ["name"], "order_by": ["date_index"], "descending": False},
]
delta_ranges = [
    [-1, -1],
    [-1, "UNBOUNDED FOLLOWING"],
]

window_delta_columns = generate_window_delta(
    inputs=["x_flag", "y_flag"],
    funcs=[f.sum],
    windows=delta_windows,
    ranges_between=delta_ranges,
    negative_term="past",
    positive_term="next",
    suffix="d",
)
windows_delta_code_config = [window_delta_columns]

df_windows_delta_code = create_columns_from_config(df_window, windows_delta_code_config)
df_windows_delta_code.show()

Node example below:

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Dictionary form of the delta grid. It is bound to `windows_delta_config`
# (without the `_code` infix), matching the naming of the other node examples
# and leaving the core example's `windows_delta_code_config` untouched.
windows_delta_config = [
    {
        "object": "feature_generation.v1.core.features.windows.generate_window_delta",
        "inputs": ["x_flag", "y_flag"],
        # Aggregation functions are given as dotted paths under "object".
        "funcs": [
            {"object": "pyspark.sql.functions.sum"},
        ],
        "windows": [
            {"partition_by": ["name"], "order_by": ["date_index"], "descending": False},
        ],
        "ranges_between": [
            [-1, -1],
            [-1, "UNBOUNDED FOLLOWING"],
        ],
        "negative_term": "past",
        "positive_term": "next",
        "suffix": "d",
    },
]

df_windows_delta = create_columns_from_config(df_window, windows_delta_config)

df_windows_delta.show()

## Interacted Features

This function creates multiple columns/features based on the interactions of the
input columns. Two or more base columns are required in order to create an
interaction feature.

We will use the following dataframe to demonstrate creating interacted features.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ArrayType,
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)


# Every column is a nullable integer.
schema = StructType(
    [
        StructField(column_name, IntegerType(), True)
        for column_name in (
            "id",
            "channel_x",
            "channel_y",
            "product_1",
            "product_2",
            "key_message_a",
            "key_message_b",
            "key_message_c",
        )
    ]
)

# Columns per row: id, channel_x, channel_y, product_1, product_2,
# key_message_a, key_message_b, key_message_c.
data = [
    (1, 0, 0, 0, 0, 0, 0, 0),
    (2, 0, 1, 0, 1, 0, 0, 1),
    (3, 1, 0, 1, 0, 0, 1, 0),
    (4, 1, 1, 1, 1, 1, 0, 0),
]
df_interact = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Print the dataframe used for the interaction-feature example.
df_interact.show()

### `create_interaction_features`:
Creates interaction features given a dictionary.

See https://christophm.github.io/interpretable-ml-book/interaction.html for
explanation on interaction features in the machine learning context.

Code example below:

In [None]:
from feature_generation.v1.core.features.interactions import create_interaction_features

# One interaction definition: each key names a group and lists the columns
# belonging to it. Entries such as "product_.*" look like regular-expression
# patterns matched against column names (TODO confirm).
interaction_groups = {
    "channel": ["channel_x", "channel_y"],
    "product": ["product_.*"],
    "key_message": ["key_message_.*"],
}

df_interaction = create_interaction_features(
    df=df_interact,
    params_interaction=[interaction_groups],
    params_spine_cols=["id"],
)
df_interaction.show()

No node example provided as there is no change in terms of parameters provided.