In [None]:
import sys

# Prepend both lookup locations in one step; "packages" ends up ahead of "../",
# exactly as two successive insert(0, ...) calls would leave them.
sys.path[:0] = ["packages", "../"]

In [None]:
import os

# When running on CircleCI, set both PySpark interpreter variables to the python
# binary of the currently active conda environment.
if os.environ.get("CIRCLECI"):
    default_env = os.environ.get("CONDA_DEFAULT_ENV")
    interpreter = f"/home/circleci/miniconda/envs/{default_env}/bin/python"
    for variable in ("PYSPARK_DRIVER_PYTHON", "PYSPARK_PYTHON"):
        os.environ[variable] = interpreter

# Tags

The tags sub-module is a central component of the `feature_generation` module. Think of a
tag as generating metadata on top of the data, very much like a hashtag. The process is
almost like going through and defining tags whenever we see certain keywords or conditions
in the data.

While tags can be used to create derived features, they should mainly be used to create
base features.

## Creating Tags

Below are the 3 main node functions you can use to generate tags:

In [None]:
from feature_generation.v1.nodes.tags.generate_tags import (
    create_tags_from_config,
    create_tags_from_config_broadcast,  # noqa: F401
    create_tags_from_config_select,
)

The main difference between the broadcast and select variants is how a tag gets
computed: if the tag creation depends on only one column, the tag is created using the
`broadcast` method; otherwise it is created using the `select` method. The broadcast
method has been shown to reduce run time from several hours to the order of tens of
minutes on big data (TB scale), but it only works for certain tag functions. We decided
to blend them to make the experience more seamless and to try to auto-infer which tags
can be created using `broadcast` and which cannot.

## Deep Dive on Broadcast
This section roughly outlines how the broadcast method works.

Imagine you have a 1 billion row dataset, but the column of interest contains only 100k
unique values. The simple approach is to compute for all 1 billion rows. As it turns out,
this is not the most efficient way.

If you collect the 100k unique values into a smaller dataframe, compute on the 100k only,
then broadcast join back to the full 1 billion rows, despite having more steps, this
process is significantly faster than doing a brute force compute on 1 billion rows.
The efficacy of this method depends on the ratio of rows to unique values.

On several engagements (different data, different infrastructure), the run time of
generating tags was in the order of several hours using the brute force method, but
took only in the order of ~10 minutes to compute using the broadcast method. Note that
the broadcast method can only be used to generate certain types of features.

## Available Tag Functions
Below are the available tag functions within the tags module:

In [None]:
from feature_generation.v1.core.tags import tags
from types import FunctionType

# Collect the names of all public (non-underscore) plain functions exposed by
# the `tags` module, in the order returned by dir().
list_of_tags = []
for attr_name in dir(tags):
    if attr_name.startswith("_"):
        continue
    if isinstance(getattr(tags, attr_name), FunctionType):
        list_of_tags.append(attr_name)

In [None]:
from tabulate import tabulate

import inspect

import pandas as pd

# Build one table row per tag function: its name plus the first line of its
# docstring.
rows = []
for tag_name in list_of_tags:
    # inspect.getdoc returns None for undocumented functions and strips leading
    # blank lines/indentation, so a missing docstring no longer raises and a
    # docstring that starts on the second line still yields its summary.
    doc = inspect.getdoc(getattr(tags, tag_name)) or ""
    summary = doc.splitlines()[0] if doc else ""
    rows.append({"function": tag_name, "description": summary})

table = pd.DataFrame(rows)
print(tabulate(table, headers=table.columns, tablefmt="psql"))

Not all tag functions are demonstrated in this document, so do check out the source
and docstrings for more information.

In [None]:
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)


from feature_generation.v1.nodes.tags.generate_tags import create_tags_from_config

# Spark session for the examples: console progress output is switched off and
# shuffles use a single partition.
spark_settings = {
    "spark.ui.showConsoleProgress": False,
    "spark.sql.shuffle.partitions": 1,
}
session_builder = SparkSession.builder
for setting, value in spark_settings.items():
    session_builder = session_builder.config(setting, value)
spark = session_builder.getOrCreate()

# Schema of the sample dataset: four scalar columns plus two array columns
# (`house`, `number`). Every column is nullable.
column_types = [
    ("name", StringType()),
    ("age", IntegerType()),
    ("gender", StringType()),
    ("occupation", StringType()),
    ("house", ArrayType(StringType())),
    ("number", ArrayType(IntegerType())),
]
schema = StructType(
    [StructField(column, data_type, True) for column, data_type in column_types]
)

# Note the mixed gender spellings ("m"/"male", "f"/"female") and the misspelt
# "daata translator"; the `isin` and `rlike` examples match on these.
baratheon, lannister, stark, targaryen = (
    f"House {family}" for family in ("Baratheon", "Lannister", "Stark", "Targaryen")
)
data = [
    ("Gendry", 31, "male", "Data Engineer", [baratheon], [1, 2, 3]),
    ("Jaime", 12, "male", "Data Scientist", [lannister], [2, 3, 4]),
    ("Tyrion", 65, "m", "Data Analyst", [lannister, stark], [3, 4, 5]),
    ("Cersei", 29, "female", "Engagement Manager", [lannister], [5, 6, 7]),
    ("Jon", 31, "male", "Software Engineer", [targaryen, stark], [6, 7, 8]),
    ("Arya", 27, "f", "MLE", [stark], [7, 8, 9]),
    ("Sansa", 26, "f", "daata translator", [stark, lannister], [8, 9, 10]),
    ("Daenerys", 36, "female", "Mother of Dragons", [targaryen], [9, 11, 13]),
]

df_sample = spark.createDataFrame(data=data, schema=schema)

# Schema of the pre-tagged dataset: the sample columns preceded by
# `series_season` and followed by `tags`, an array of (tag, value) structs.
tag_entry = StructType().add("tag", StringType(), True).add("value", DoubleType(), True)
tag_schema = (
    StructType()
    .add("series_season", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("gender", StringType(), True)
    .add("occupation", StringType(), True)
    .add("house", ArrayType(StringType()), True)
    .add("number", ArrayType(IntegerType()), True)
    .add("tags", ArrayType(tag_entry))
)

# The three distinct tags that appear in the data, each carrying the value 1.0.
data_tag = ("occupation-data", 1.0)
not_data_tag = ("occupation-not-data", 1.0)
extra_tag = ("extra-tag", 1.0)

# All rows belong to season 1, so the season is prepended once below.
season = 1
tagged_people = [
    ("Gendry", 31, "male", "Data Engineer", ["House Baratheon"], [1, 2, 3], [data_tag]),
    ("Jaime", 12, "male", "Data Scientist", ["House Lannister"], [2, 3, 4], [data_tag]),
    (
        "Tyrion",
        65,
        "male",
        "Data Analyst",
        ["House Lannister", "House Stark"],
        [3, 4, 5],
        [data_tag],
    ),
    (
        "Cersei",
        29,
        "female",
        "Engagement Manager",
        ["House Lannister"],
        [5, 6, 7],
        [not_data_tag],
    ),
    (
        "Jon",
        31,
        "male",
        "Software Engineer",
        ["House Targaryen", "House Stark"],
        [6, 7, 8],
        [extra_tag],
    ),
    ("Arya", 27, "female", "MLE", ["House Stark"], [7, 8, 9], [not_data_tag]),
    (
        "Sansa",
        26,
        "female",
        "daata translator",
        ["House Stark", "House Lannister"],
        [8, 9, 10],
        [data_tag, extra_tag],
    ),
    (
        "Daenerys",
        36,
        "female",
        "Mother of Dragons",
        ["House Targaryen"],
        [9, 11, 13],
        [not_data_tag],
    ),
]
tag_data = [(season, *person) for person in tagged_people]

df_with_tag_sample = spark.createDataFrame(data=tag_data, schema=tag_schema)

We will use the following dataframe to demonstrate the various tag functions:

In [None]:
# Print both sample dataframes in full, without truncating column values.
for frame in (df_sample, df_with_tag_sample):
    frame.show(truncate=False)

### `isin`
This generates a tag column if any of the list values is found in the column.
This tag function can be used for creating fundamental, derived and complex tags.

Let's check the dataframe with tags:

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import isin

# Accepted spellings per tag; one `isin` tag is configured for each entry, all
# reading from the `gender` column.
gender_spellings = {
    "gender_is_male": ["m", "male"],
    "gender_is_female": ["f", "female"],
}
tag_config = [
    isin(tag=tag_name, input="gender", values=spellings)
    for tag_name, spellings in gender_spellings.items()
]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

For those who wish to perform the same via configuration, here is an example for the
same config:

In [None]:
import yaml

# YAML equivalent of the `isin` tag config above: each entry names the tag
# function by its import path under "object", and the remaining keys mirror the
# keyword arguments used in the Python example.
tag_config_string = """
  - "object": "feature_generation.v1.core.tags.tags.isin"
    "tag": "gender_is_male"
    "input": "gender"
    "values": ["m", "male"]
  - "object": "feature_generation.v1.core.tags.tags.isin"
    "tag": "gender_is_female"
    "input": "gender"
    "values": ["f", "female"]
"""

In [None]:
from feature_generation.v1.nodes.tags.generate_tags import create_tags_from_config

# Show the raw YAML first, then parse it into a list of dicts.
print(tag_config_string)
tag_config = yaml.safe_load(tag_config_string)
# Config-driven entry point; presumably resolves each "object" path to the
# corresponding tag function -- see its docstring to confirm.
same_result = create_tags_from_config(df_sample, tag_config)
same_result.show(truncate=False)

Note the change of the import from `core` to `nodes`. For the rest of the tutorial, we
will be demonstrating via the code examples rather than config.


### `rlike`
This generates a tag column if the regex pattern can be found in the
column. This tag function can be used for creating fundamental,
derived and complex tags.

Usage:

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import rlike

# Regex patterns searched for in `occupation`; "daata" matches the misspelt
# "daata translator" entry in the sample data.
data_patterns = ["Data", "daata"]
tag_config = [rlike(tag="occupation_data", input="occupation", values=data_patterns)]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `rlike_multi_col`
This generates a tag if any of the patterns matches for multiple columns.
This function is more useful when you need to look across columns, for example,
the output should be 1 if column A equals X or column B equals Y.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from operator import and_
from feature_generation.v1.core.tags.tags import rlike_multi_col

# One set of patterns per column; `and_` is passed as the operator argument.
column_patterns = [
    {"input": "name", "values": ["Sansa", "Tyrion"]},
    {"input": "occupation", "values": ["Data", "daata"]},
]
partners_tag = rlike_multi_col(
    tag="partners_in_data_field", inputs=column_patterns, operator=and_
)
tag_config = [partners_tag]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `array_rlike`
This generates a tag column if any elements in the array regex match the
pattern. This may be useful when the data comes in an array (usually to store a
1-N relationship without impacting the row count of the table).

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import array_rlike

# Patterns matched against the elements of the `house` array column.
house_patterns = ["Targaryen", "Stark"]
tag_config = [
    array_rlike(tag="favourite_house", input="house", values=house_patterns),
]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `arrays_overlap`
This generates a tag column if any element in the list can be found in
the array. In contrast to `array_rlike`, `arrays_overlap` performs exact matches and
will return a tag if there is at least 1 overlapping element.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import arrays_overlap

# A row is tagged when its `number` array shares at least one element with
# this list.
even_values = [2, 4, 6, 8, 10]
tag_config = [
    arrays_overlap(tag="even_numbers", input="number", values=even_values),
]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `arrays_not_overlap`
This generates a tag column if none of the list values can be found in the array.
Like `arrays_overlap` but with the `NOT`.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import arrays_not_overlap

# Same even values as in the `arrays_overlap` example, but used with the
# negated tag function.
even_values = [2, 4, 6, 8, 10]
tag_config = [
    arrays_not_overlap(tag="odd_numbers", input="number", values=even_values),
]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `array_contains_all`
This generates a tag column if all elements in the list can be found in
the array.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import array_contains_all

# A row is tagged only when its `number` array holds every one of these values.
required_values = [2, 4]
tag_config = [
    array_contains_all(tag="favourite_numbers", input="number", values=required_values),
]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `array_not_contains_all`
This generates a tag column if the array does not contain all the elements.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import array_not_contains_all

# A row is tagged when its `number` array does not hold all of these values.
excluded_values = [9, 11, 13]
not_all_tag = array_not_contains_all(
    tag="numbers_less_than_9", input="number", values=excluded_values
)
tag_config = [not_all_tag]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `dynamic_tag`
This function constructs a tag where the value of the tag is evaluated through
`expr`. In most cases tags contain a value of either 0 or 1. This tag lets us
store a continuous number in the value.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import dynamic_tag

# Expression whose evaluated result is stored as the tag's value.
age_expression = "age-2"
tag_config = [dynamic_tag(tag="age_minus_2", expr=age_expression)]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `one_hot_tag`
This function constructs a tag where the value is part of the tag name.
The idea is similar to one-hot-encoding, where each value generates a unique tag.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import one_hot_tag

# Expression whose value becomes part of the generated tag name.
age_expression = "age-2"
tag_config = [one_hot_tag(tag="age_minus_2", expr=age_expression)]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `expr_tag`
This function adds the tag as the column value if the expression evaluates to True.
Looks similar to `dynamic_tag`, except the expression must produce a boolean.
The value of the tag will return 1 if the boolean evaluates to True.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import expr_tag

# Boolean expression; the tag is added for rows where it evaluates to True.
age_condition = "age<25"
tag_config = [expr_tag(tag="age_less_than_25", expr=age_condition)]

results = create_tags_from_config_select(df_sample, tag_config)
results.show(truncate=False)

### `tag_array_rlike`
This generates a tag column if any elements in the tag array regex match
the pattern. This tag function can be used for creating fundamental,
derived and complex tags. This might be useful if you need to search the tag column
for the presence of a specific tag.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import tag_array_rlike

# Pattern searched for among the tags already stored in the `tags` column of
# the pre-tagged dataframe.
tag_patterns = ["occupation-data"]
data_field_tag = tag_array_rlike(
    tag="occupation_in_data_field", input="tags", values=tag_patterns
)
tag_config = [data_field_tag]

results = create_tags_from_config_select(df_with_tag_sample, tag_config)
results.show(truncate=False)

### `tag_arrays_overlap`
This generates a tag column if any of the list values is found in the tag array.
This is the same as `arrays_overlap` but is designed to work on a tag array column.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import tag_arrays_overlap

# Existing tags to look for; a row qualifies when any of them is present in
# its `tags` array.
wanted_tags = ["occupation-data", "extra-tag"]
overlap_tag = tag_arrays_overlap(
    tag="occupation_in_data_field", input="tags", values=wanted_tags
)
tag_config = [overlap_tag]

results = create_tags_from_config_select(df_with_tag_sample, tag_config)
results.show(truncate=False)

### `tag_arrays_not_overlap`
This generates a tag column if elements in values cannot be found in the tag array.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import tag_arrays_not_overlap

# Existing tags that must be absent from the row's `tags` array for the new
# tag to be produced.
unwanted_tags = ["occupation-data", "extra-tag"]
no_overlap_tag = tag_arrays_not_overlap(
    tag="occupation_not_in_data_field", input="tags", values=unwanted_tags
)
tag_config = [no_overlap_tag]

results = create_tags_from_config_select(df_with_tag_sample, tag_config)
results.show(truncate=False)

### `tag_array_contains_all`
This generates a tag column if all the values can be found in the tag array.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import tag_array_contains_all

# Existing tags that must all be present in the row's `tags` array.
required_tags = ["occupation-data", "extra-tag"]
contains_all_tag = tag_array_contains_all(
    tag="occupation_and_extra_data_field", input="tags", values=required_tags
)
tag_config = [contains_all_tag]

results = create_tags_from_config_select(df_with_tag_sample, tag_config)
results.show(truncate=False)

### `tag_array_not_contains_all`
This generates a tag column if the tag array does not contain all the elements.

In [None]:
from feature_generation.v1.core.tags.generate_tags import create_tags_from_config_select
from feature_generation.v1.core.tags.tags import tag_array_not_contains_all

# The new tag is produced when the row's `tags` array does not contain all of
# these existing tags.
reference_tags = ["occupation-data", "extra-tag"]
not_contains_all_tag = tag_array_not_contains_all(
    tag="occupation_not_in_extra_data_field", input="tags", values=reference_tags
)
tag_config = [not_contains_all_tag]

results = create_tags_from_config_select(df_with_tag_sample, tag_config)
results.show(truncate=False)

### Post Tag Creation Functions

After you have generated the tags, you still need to be able to extract them
out of the tag array.

### `expand_tags` and `expand_tags_all`
After creating tags, `expand_tags` helps us convert tags into columns and then
aggregate them on a desired set of columns.
The aggregation step is optional.

`expand_tags_all` is a special case of `expand_tags` which explodes all tags
present in the data into columns, rather than only the tags named in the list
of tags to expand, as `expand_tags` does.


### Simple Usage - Extraction
This example will be shown in a config way, but you may simply replace the
dictionaries with the actual functions/objects themselves.

In [None]:
# Tags to pull out of the tag array, and the column to aggregate on later.
list_of_tags = ["occupation-data", "occupation-not-data"]
key_cols = ["gender"]

# One "<tag>_cnt" output column per tag (dashes become underscores), each
# configured to apply pyspark.sql.functions.sum to the matching tag column.
column_instructions = {
    f"{tag_name.replace('-', '_')}_cnt": {
        "object": "pyspark.sql.functions.sum",
        "col": tag_name,
    }
    for tag_name in list_of_tags
}

Let's say this is the list of tags we wish to extract:

In [None]:
# Display the tag names selected for extraction.
print(list_of_tags)

We can pass this list to the function below:

In [None]:
from feature_generation.v1.core.tags.expand_tags import expand_tags

# Expand only the requested tags into columns; no key columns or aggregation
# instructions are passed in this call.
expand_tags_df = expand_tags(
    df_with_tags=df_with_tag_sample, tags_to_convert=list_of_tags
)

expand_tags_df.show(truncate=False)

### Alternative Usage - Automatic Extraction
This example shows an alternative way to explode tags using `expand_tags_all`.
This function expands all the tags present in the dataframe into separate columns.

Let's say this is the list of tags which are present in the dataset's `tags` column:

`["occupation-data", "occupation-not-data", "extra-tag"]`

We can expand the tags column using the following snippet of code:

In [None]:
from feature_generation.v1.core.tags.expand_tags import expand_tags_all

# No tag list is passed here: per the description above, every tag present in
# the dataframe is expanded into its own column.
expand_tags_df = expand_tags_all(df_with_tags=df_with_tag_sample)

expand_tags_df.show(truncate=False)

### Advanced Usage - Extraction then Aggregation
A more common pattern is to extract the tags into columns, then aggregate
to the desired unit of analysis.


In this case, we wish to aggregate on this column:

In [None]:
# Display the column(s) the expanded tags will be aggregated on.
print(key_cols)

Along with the aggregation function to be applied:

In [None]:
import yaml

# Render the aggregation instructions as YAML so they read like a config file.
instructions_as_yaml = yaml.safe_dump(column_instructions)
print(instructions_as_yaml)

We can pass these extra arguments:

In [None]:
from feature_generation.v1.nodes.tags.expand_tags import expand_tags

# The extra arguments that add an aggregation step on top of plain extraction.
aggregation_options = {
    "key_cols": key_cols,
    "column_instructions": column_instructions,
}
expand_tags_agg_df = expand_tags(
    df_with_tags=df_with_tag_sample,
    tags_to_convert=list_of_tags,
    **aggregation_options,
)
expand_tags_agg_df.show(truncate=False)