In [None]:
import sys

# Make the parent directory and the local "packages" folder importable.
# Both entries are prepended in one go; "packages" takes precedence over "../".
sys.path[:0] = ["packages", "../"]

In [None]:
import os

# On CircleCI, point PySpark's driver and worker interpreters at the Python
# binary of the active conda environment so both sides use the same one.
if os.environ.get("CIRCLECI"):
    default_env = os.environ.get("CONDA_DEFAULT_ENV")
    # Without an active conda env the path would contain the literal "None";
    # in that case leave PySpark's interpreter settings untouched.
    if default_env:
        conda_python = f"/home/circleci/miniconda/envs/{default_env}/bin/python"
        os.environ["PYSPARK_DRIVER_PYTHON"] = conda_python
        os.environ["PYSPARK_PYTHON"] = conda_python

# Feature Generation - Case study

## Overview:
This document shows how the various feature generation components can be combined into a typical pipeline. We use the `nodes` functions rather than the `core` functions to simulate how a pipeline would supply parameters; everything shown here can also be done directly in code with the `core` functions.


## Use case:
We want to analyse the shopping pattern of customers for a chain of supermarkets.

Our end goal is to create the following list of features:

- Number of orders placed by female customers in the last 6 months at a given store.
- Number of orders placed by female customers aged over 60 in the last 6 months at a given store.
- Number of orders containing groceries in the last 6 months.
- Percentage of orders placed by female customers aged over 60 in the last 6 months at a given store.


We will be using the following dataframe:

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Start (or reuse) a Spark session with the console progress bar switched off.
spark = (
    SparkSession.builder
    .config("spark.ui.showConsoleProgress", False)
    .getOrCreate()
)

# Schema of the events dataframe, built from (column name, Spark type) pairs.
# Every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, spark_type(), True)
        for column_name, spark_type in (
            ("store_id", StringType),
            ("order_id", IntegerType),
            ("product_id", StringType),
            ("product_name", StringType),
            ("quantity", IntegerType),
            ("price", IntegerType),
            ("observation_dt", DateType),
            ("cust_id", StringType),
            ("cust_age", IntegerType),
            ("cust_gender", StringType),
            ("month_index", IntegerType),
        )
    ]
)
# Sample events, one tuple per row in schema column order. Dates are written as
# ISO strings and converted to datetime.date in a single place; repeated rows
# are kept exactly as in the original sample.
data = [
    (
        store_id,
        order_id,
        product_id,
        product_name,
        quantity,
        price,
        pd.Timestamp(observation_day).date(),
        cust_id,
        cust_age,
        cust_gender,
        month_index,
    )
    for (
        store_id, order_id, product_id, product_name, quantity, price,
        observation_day, cust_id, cust_age, cust_gender, month_index,
    ) in [
        ("s1", 1, "A", "prod_a", 1, 10, "2021-03-01", "c1", 62, "f", 103),
        ("s1", 1, "A", "prod_a", 1, 10, "2021-03-01", "c1", 62, "f", 103),
        ("s1", 2, "B", "prod_b", 1, 10, "2021-01-01", "c1", 22, "m", 101),
        ("s1", 3, "C", "prod_c", 1, 10, "2021-02-01", "c1", 42, "f", 102),
        ("s1", 3, "C", "prod_c", 1, 10, "2021-02-01", "c1", 42, "f", 102),
        ("s1", 4, "B", "prod_b", 1, 10, "2021-04-01", "c1", 22, "m", 104),
        ("s1", 4, "B", "prod_b", 1, 10, "2021-04-01", "c1", 22, "m", 104),
        ("s1", 4, "B", "prod_b", 1, 10, "2021-04-01", "c1", 22, "m", 104),
        ("s2", 5, "D", "prod_d", 1, 10, "2021-05-01", "c1", 65, "f", 105),
        ("s2", 5, "D", "prod_d", 1, 10, "2021-05-01", "c1", 65, "f", 105),
        ("s3", 6, "E", "prod_e", 1, 10, "2021-03-01", "c1", 65, "m", 103),
        ("s3", 7, "F", "prod_f", 1, 10, "2021-01-01", "c1", 61, "f", 101),
        ("s3", 7, "F", "prod_f", 1, 10, "2021-01-01", "c1", 61, "f", 101),
        ("s3", 8, "G", "prod_g", 1, 10, "2021-04-01", "c1", 32, "f", 104),
        ("s3", 8, "G", "prod_g", 1, 10, "2021-04-01", "c1", 32, "f", 104),
        ("s3", 9, "H", "prod_h", 1, 10, "2021-05-01", "c1", 41, "m", 105),
        ("s1", 10, "I", "prod_i", 1, 10, "2021-06-01", "c1", 62, "m", 106),
        ("s1", 10, "I", "prod_i", 1, 10, "2021-06-01", "c1", 62, "m", 106),
        ("s1", 10, "I", "prod_i", 1, 10, "2021-06-01", "c1", 62, "m", 106),
        ("s1", 11, "J", "prod_j", 1, 10, "2021-07-01", "c1", 21, "f", 107),
        ("s4", 12, "J", "prod_j", 1, 10, "2021-02-01", "c1", 21, "f", 102),
        ("s1", 13, "A", "prod_a", 1, 10, "2021-01-01", "c1", 62, "f", 101),
        ("s4", 14, "C", "prod_c", 1, 10, "2021-01-01", "c1", 42, "f", 101),
        ("s1", 11, "J", "prod_j", 1, 10, "2021-01-01", "c1", 21, "f", 101),
    ]
]
# Materialise the sample rows as a Spark dataframe with the explicit schema.
df_events = spark.createDataFrame(data, schema=schema)

## Components:

### Primary:

In [None]:
# Preview the primary events table that the tagging step takes as input.
df_events.show()

### Tags creation:
Each row of the primary table is tagged according to the rules defined in the tag configuration.

In [None]:
import yaml

# YAML list of tag definitions. Each entry gives the tag name ("tag") and the
# dotted path of the tagging function ("object"); the remaining keys look like
# that function's arguments — TODO confirm against the library.
tag_config_string = """
  - "tag": "female"
    "object": "feature_generation.v1.core.tags.tags.isin"
    "input": "cust_gender"
    "values": [ "f" ]
  - "tag": "over60"
    "object": "feature_generation.v1.core.tags.tags.expr_tag"
    "expr": "cust_age > 60"
  - "tag": "grocery"
    "object": "feature_generation.v1.core.tags.tags.isin"
    "input": "product_name"
    "values": ["prod_a", "prod_c"]
"""

In [None]:
from feature_generation.v1.nodes.tags.generate_tags import create_tags_from_config

# Echo the raw YAML so the reader sees the configuration being applied.
print(tag_config_string)
# Parse the YAML text into Python objects (here: a list of dicts).
tag_config = yaml.safe_load(tag_config_string)
# Apply the tag definitions to the events dataframe.
df_tag = create_tags_from_config(df_events, tag_config)
# truncate=False prints full cell contents instead of shortening long values.
df_tag.show(truncate=False)

### Expand tags/Conversion to flags: Tags are converted to individual flags.

In [None]:
# Tags to convert into individual flag columns when the list is given explicitly.
list_of_tags = ["female", "over60", "grocery"]

# NOTE(review): key_cols and column_instructions are not passed to either
# expand_tags call in this cell — presumably leftovers; confirm before relying
# on them.
key_cols = ["store_id", "observation_dt", "month_index"]

column_instructions = {
    output_col: {"object": "pyspark.sql.functions.max", "col": tag_col}
    for output_col, tag_col in (
        ("is_female", "female"),
        ("cust_age_over_60", "over60"),
        ("grocery", "grocery"),
    )
}


from feature_generation.v1.nodes.tags.expand_tags import expand_tags, expand_tags_all

# Option 1: expand only the tags named in an explicit list
expand_tags_df = expand_tags(tags_to_convert=list_of_tags, df_with_tags=df_tag)

# Option 2: expand all tags automatically, no list required
expand_tags_all_df = expand_tags_all(df_with_tags=df_tag)

# Only the Option 1 result is displayed; it is also the one used for derived flags.
expand_tags_df.show(truncate=False)

### Derived flags:

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Single derived flag: 1 when both the female and over60 flags are 1, otherwise 0.
flags_config = [
    dict(
        object="feature_generation.v1.core.features.flags.expr_col",
        output="is_female_over60",
        expr="case when female = 1 and over60 = 1 then 1 else 0 end",
    ),
]

# Add the derived flag column(s) described by flags_config to the expanded-tags dataframe.
df_flags = create_columns_from_config(expand_tags_df, flags_config)

# Preview the result, including the new is_female_over60 column.
df_flags.show()

### Aggregate: Aggregate flags at spine's granularity.

In [None]:
from feature_generation.v1.nodes.aggregation.aggregate import aggregate_attributes

# Columns that define the granularity of the aggregated table.
key_cols = ["store_id", "observation_dt", "month_index"]

# Each entry: (output feature, Spark aggregate function, column it is applied to).
# NOTE(review): sum/count work per row, so repeated rows of the same order_id are
# each counted — confirm that row-level counts are what "orders" should mean.
# NOTE(review): the grocery flag is not aggregated here although the overview
# lists a grocery-orders feature — confirm whether it should be added.
agg_config = {
    feature_name: {"object": spark_function, "col": source_col}
    for feature_name, spark_function, source_col in (
        ("ftr_cnt_female_orders", "pyspark.sql.functions.sum", "female"),
        ("ftr_cnt_total_orders", "pyspark.sql.functions.count", "order_id"),
        ("ftr_cnt_female_over60_orders", "pyspark.sql.functions.sum", "is_female_over60"),
    )
}


# Aggregate the row-level flags using agg_config, keyed on key_cols
# (presumably one output row per key combination — TODO confirm).
df_agg = aggregate_attributes(
    df=df_flags, key_cols=key_cols, column_instructions=agg_config
)

In [None]:
# Sort by store, then observation date, so the displayed table is easy to read.
df_agg = df_agg.orderBy("store_id", "observation_dt")

In [None]:
# Preview the aggregated table that the window features are computed from.
df_agg.show()

### Windows: Calculate features over required windows.

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# One window-grid instruction: sum each input column per store, ordered by
# month_index, over two look-back ranges. The ranges are presumably offsets
# relative to the current row's month_index (e.g. -6..-1 = the six preceding
# months, current month excluded) — TODO confirm against generate_window_grid.
window_grid_config = [
    dict(
        object="feature_generation.v1.core.features.windows.generate_window_grid",
        inputs=[
            "ftr_cnt_female_orders",
            "ftr_cnt_total_orders",
            "ftr_cnt_female_over60_orders",
        ],
        funcs=[dict(object="pyspark.sql.functions.sum")],
        windows=[dict(partition_by=["store_id"], order_by=["month_index"])],
        ranges_between=[[-6, -1], [-4, -1]],
        suffix="m",
    ),
]

# Add the window-grid columns described by window_grid_config to the aggregated table.
df_windows_grid = create_columns_from_config(df_agg, window_grid_config)

# Preview the windowed features.
df_windows_grid.show()

### Derived windows: Create derived window features (example: percentage over window)

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Ratios derived from the 6-month window sums. Both expressions read only
# columns produced by the window grid, so they do not depend on each other.
derived_config = [
    {
        # Share of all orders in the window that were placed by female customers.
        "object": "feature_generation.v1.core.features.flags.expr_col",
        "output": "ftr_perc_female_orders_sum_past_6_past_1m",
        "expr": "ftr_cnt_female_orders_sum_past_6_past_1m/ftr_cnt_total_orders_sum_past_6_past_1m",
    },
    {
        # Share placed by female customers over 60 — the percentage feature
        # listed in the use case, which was previously never derived.
        "object": "feature_generation.v1.core.features.flags.expr_col",
        "output": "ftr_perc_female_over60_orders_sum_past_6_past_1m",
        "expr": "ftr_cnt_female_over60_orders_sum_past_6_past_1m/ftr_cnt_total_orders_sum_past_6_past_1m",
    },
]

# Add the derived ratio column(s) on top of the windowed features.
df_windows_derived = create_columns_from_config(df_windows_grid, derived_config)

# Preview the final feature table.
df_windows_derived.show()

### Alternative step to generate windows and derived windows

Since windows and derived windows both use the same function and input dataframe, they can be combined into a single node, with the steps running in series.

In [None]:
from feature_generation.v1.nodes.features.create_column import (
    create_columns_from_config,
)

# Combined configuration: the window grid first, then the ratios derived from
# its output columns. The ratio steps read columns created by the first step,
# so this list is meant to be applied in order.
feature_config = [
    {
        "object": "feature_generation.v1.core.features.windows.generate_window_grid",
        "inputs": [
            "ftr_cnt_female_orders",
            "ftr_cnt_total_orders",
            "ftr_cnt_female_over60_orders",
        ],
        "funcs": [
            {"object": "pyspark.sql.functions.sum"},
        ],
        "windows": [
            {"partition_by": ["store_id"], "order_by": ["month_index"]},
        ],
        "ranges_between": [
            [-6, -1],
            [-4, -1],
        ],
        "suffix": "m",
    },
    {
        # Share of all orders in the window that were placed by female customers.
        "object": "feature_generation.v1.core.features.flags.expr_col",
        "output": "ftr_perc_female_orders_sum_past_6_past_1m",
        "expr": "ftr_cnt_female_orders_sum_past_6_past_1m/ftr_cnt_total_orders_sum_past_6_past_1m",
    },
    {
        # Share placed by female customers over 60 — the percentage feature
        # listed in the use case, which was previously never derived.
        "object": "feature_generation.v1.core.features.flags.expr_col",
        "output": "ftr_perc_female_over60_orders_sum_past_6_past_1m",
        "expr": "ftr_cnt_female_over60_orders_sum_past_6_past_1m/ftr_cnt_total_orders_sum_past_6_past_1m",
    },
]

# Apply the whole feature_config to the aggregated table in one call.
# sequential=True presumably applies the instructions one after another, so the
# expr_col step can read the columns created by the window grid — TODO confirm.
df_overall_features = create_columns_from_config(
    df=df_agg, column_instructions=feature_config, sequential=True
)

# Preview the combined result.
df_overall_features.show()