In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from framework.feature_factory.helpers import Helpers
from channelDemoMarket import Store

# Start (or reuse) the Spark session backing this notebook.
spark = (
    SparkSession.builder
    .appName('Test')
    .getOrCreate()
)
# Number of partitions Spark SQL uses when shuffling: 96 * 2 = 192.
spark.conf.set("spark.sql.shuffle.partitions", 96 * 2)

In [None]:
# Instantiate the store for a fixed snapshot date
SNAPSHOT_DATE = "2018-01-01"
store = Store(_snapshot_date=SNAPSHOT_DATE)

In [None]:
# Get the feature factory exposed by the store; it is used below through
# ff.append_features(...) to build the feature dataframes.
ff = store.ff

In [None]:
# Grab the sales features: get_all() returns a pair, unpacked here into
# mult_features and base_features.
sales_features = store.Sales()
mult_features, base_features = sales_features.get_all()

In [None]:
### For visualization only.
# Pull the "issuer" core as an example dataframe, then report its size and preview it
example_df = store.get_core("issuer")
example_row_count = example_df.count()
example_col_count = len(example_df.columns)
print("n_rows: %d" % example_row_count)
print("n_cols: %d" % example_col_count)
example_df.show(n=5, truncate=True)

In [None]:
### For visualization only.
# Pull the "bank_id" core, aliased as 'bank' so that the join further down
# can refer to its columns as 'bank.<column>'.
bank_core = store.get_core("bank_id")
bank_df = bank_core.alias('bank')
bank_row_count = bank_df.count()
bank_col_count = len(bank_df.columns)
print("n_rows: %d" % bank_row_count)
print("n_cols: %d" % bank_col_count)
bank_df.show(n=5, truncate=True)

In [None]:
# Build the base sales dataframe from the "issuer" core, keeping only rows
# with a strictly positive purchase amount. The 'clean_amount' alias is what
# the join cell uses to select these columns ('clean_amount.*').
has_positive_amount = col('purchase_txn_amt') > 0
store_sales_df = (
    store.get_core("issuer")
    .filter(has_positive_amount)
    .alias('clean_amount')
)

In [None]:
# Report the size of the filtered sales dataframe and preview the first rows
sales_row_count = store_sales_df.count()
sales_col_count = len(store_sales_df.columns)
print("n_rows: %d" % sales_row_count)
print("n_cols: %d" % sales_col_count)
store_sales_df.show(n=5, truncate=True)

### Continue with example

Make a join!

In [None]:
# Join sales to banks on issuer_name, keeping every sales column plus the
# bank's issuer_id.
joined_df = store_sales_df.join(bank_df, on=['issuer_name'])
base_df = joined_df.select('clean_amount.*', 'bank.issuer_id')
base_df.show(n=5, truncate=True)

### Show distinct categoricals

In [None]:
# Distinct values of chip_indicator_uid; the collected rows are the cell output
chip_indicator_values = base_df.select("chip_indicator_uid").distinct()
chip_indicator_values.collect()

In [None]:
# Distinct values of product_uid; the collected rows are the cell output
product_values = base_df.select("product_uid").distinct()
product_values.collect()

### Build a Features Dataframe

Here we simply apply the aggregation methods in `mult_features` over a group-by on `issuer_name`.

In [None]:
# Append the mult_features feature set to base_df, grouped by issuer_name
feature_df = ff.append_features(
    base_df,
    groupBy_cols=['issuer_name'],
    feature_sets=[mult_features],
)
# Alternative: build from the pre-join sales dataframe instead of base_df
#feature_df = ff.append_features(store_sales_df, groupBy_cols = ['issuer_name'], feature_sets=[mult_features])
feature_df.show()

### Using Multipliers

We will create a new features dataframe using composite aggregations.

In [None]:
# Inspect the 'time_helpers' configuration; its .configs attribute is the cell output
time_helpers_config = store.config.get_config('time_helpers')
time_helpers_config.configs

In [None]:
# Get the date-range multiplier from the store; it is applied to
# mult_features via multiply() in the following cell.
# NOTE(review): presumably driven by the 'time_helpers' config shown above — confirm.
time_multipliers = store.get_daterange_multiplier()

In [None]:
# Multiply the base feature set by the time multipliers to get a new feature set.
# NOTE(review): presumably this yields one variant of each feature per date
# range, and "STORE" is a name/prefix for the generated features — confirm
# against the feature factory's multiply() implementation.
mult_by_time_features = mult_features.multiply(time_multipliers, "STORE")

### Nested features
Now let's assume we have several categorical columns for which we want to calculate aggregates.

The categorical multiplier lets you either specify the columns to which the multiplier should be applied, or provide a minimum distinct-values count n together with an ignore list of columns, in which case it will efficiently find all the columns with < n distinct values.

In [None]:
# Build a categorical multiplier over the 'product_uid' column of the "issuer" core
helpers = Helpers()
categorical_multiplier = helpers.get_categoricals_multiplier(
    df=store.get_core("issuer"),
    col_list=['product_uid'],
)

# Nest it on top of the time-multiplied features
by_time_by_cat = mult_by_time_features.multiply(categorical_multiplier, "STORE")

In [None]:
# Rebuild the features dataframe with the base, time-multiplied and
# time-by-category feature sets, grouped by issuer_name.
nested_feature_sets = [mult_features, mult_by_time_features, by_time_by_cat]
feature_df = ff.append_features(
    base_df,
    groupBy_cols=['issuer_name'],
    feature_sets=nested_feature_sets,
)

In [None]:
# Report how many columns the features dataframe has, then preview it
feature_col_count = len(feature_df.columns)
print("n cols = %d" % feature_col_count)
feature_df.show(n=5, truncate=True)

### Further nesting

In [None]:
# Build a second categorical multiplier, this time over 'chip_indicator_uid'
helpers_2 = Helpers()
categorical_multiplier_2 = helpers_2.get_categoricals_multiplier(
    df=store.get_core("issuer"),
    col_list=['chip_indicator_uid'],
)

# Nest it on top of the time-by-category features
by_time_by_cat_by_cat = by_time_by_cat.multiply(categorical_multiplier_2, "STORE")

In [None]:
# Rebuild the features dataframe from the doubly nested feature set only,
# grouped by issuer_name.
further_nested_feature_sets = [by_time_by_cat_by_cat]
feature_df = ff.append_features(
    base_df,
    groupBy_cols=['issuer_name'],
    feature_sets=further_nested_feature_sets,
)

In [None]:
# Report how many columns the features dataframe has, then preview it
feature_col_count = len(feature_df.columns)
print("n cols = %d" % feature_col_count)
feature_df.show(n=5, truncate=True)