In [1]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from framework.feature_factory.helpers import Helpers
from channelDemoMarket import Store

##spark.conf.set("spark.sql.shuffle.partitions", 96*2)

In [2]:
# Instantiate the store, pinning the snapshot date to 2018-01-01.
store = Store(_snapshot_date = "2018-01-01")
# Alternative without an explicit snapshot date
# (presumably falls back to Store's own default - TODO confirm):
#store = Store()

In [3]:
# Get the feature factory exposed by the store; the cells below use it
# (via ff.append_features) to build the feature dataframes.
ff = store.ff

In [4]:
# Grab the sales feature sets from the store.
# Only mult_features is referenced by the cells visible here; base_features
# is unpacked but not used again in them.
mult_features, base_features = store.Sales().get_all()

In [5]:
### For visualization only.
# Load the unfiltered "issuer" core so its shape and contents can be inspected.
example_df = store.get_core("issuer")

# Report the frame's dimensions, then preview the first five rows.
print("n_rows: {:d}".format(example_df.count()))
print("n_cols: {:d}".format(len(example_df.columns)))
example_df.show(n=5, truncate=True)

n_rows: 5613
n_cols: 7
+--------------------+-----------+------------------+-----------+--------------------+----------------+----------------+
|header_capture_month|issuer_name|chip_indicator_uid|product_uid|         description|purchase_txn_amt|purchase_txn_cnt|
+--------------------+-----------+------------------+-----------+--------------------+----------------+----------------+
| 2016-09-01 00:00:00| Royal Bank|                 7|          1|     Hardware Stores|   7.127944081E7|          919933|
| 2017-04-01 00:00:00| Scotiabank|                 9|          1|Drug Stores, Phar...|   1.672346653E7|          784273|
| 2018-12-01 00:00:00| Royal Bank|                 3|          1|Grocery Stores, S...|       100088.58|            1194|
| 2018-08-01 00:00:00|       CIBC|                 7|          1|     Hardware Stores|   6.163271108E7|          706047|
| 2017-09-01 00:00:00|       CIBC|                 3|          1|Fuel Dispenser, A...|             0.0|               0|
+--------

In [6]:
### For visualization only.
# Build a base dataframe from cores/sources
#bank_df = store.get_core("bank_id").alias('bank')
#print("n_rows: %d" %bank_df.count())
#print("n_cols: %d" %len(bank_df.columns))
#bank_df.show(truncate=True, n=5)

In [7]:
#example_df.select("issuer_name").distinct().show()

In [8]:
# Base dataframe for feature building: the "issuer" core restricted to rows
# with a strictly positive purchase amount. The 'clean_amount' alias lets
# later cells refer to columns as 'clean_amount.<column>'.
store_sales_df = (
    store.get_core("issuer")
    .filter(col("purchase_txn_amt") > 0)
    .alias("clean_amount")
)

In [9]:
# Report the filtered frame's dimensions, then preview the first five rows.
print("n_rows: {:d}".format(store_sales_df.count()))
print("n_cols: {:d}".format(len(store_sales_df.columns)))
store_sales_df.show(n=5, truncate=True)

n_rows: 5342
n_cols: 7
+--------------------+---------------+------------------+-----------+--------------------+----------------+----------------+
|header_capture_month|    issuer_name|chip_indicator_uid|product_uid|         description|purchase_txn_amt|purchase_txn_cnt|
+--------------------+---------------+------------------+-----------+--------------------+----------------+----------------+
| 2016-09-01 00:00:00|     Royal Bank|                 7|          1|     Hardware Stores|   7.127944081E7|          919933|
| 2017-04-01 00:00:00|     Scotiabank|                 9|          1|Drug Stores, Phar...|   1.672346653E7|          784273|
| 2018-12-01 00:00:00|     Royal Bank|                 3|          1|Grocery Stores, S...|       100088.58|            1194|
| 2018-08-01 00:00:00|           CIBC|                 7|          1|     Hardware Stores|   6.163271108E7|          706047|
| 2018-10-01 00:00:00|TD Canada Trust|                 3|          1|Eating Places, Re...|           9

In [10]:
# Earliest header_capture_month present in the filtered sales data.
# The aggregate has no grouping, so it yields a single row with a single value.
store_sales_df.agg(F.min("header_capture_month")).first()[0]

datetime.datetime(2016, 1, 1, 0, 0)

### Continue with example

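Make a join! (Currently disabled: the join cells below are commented out, so the features are built directly from `store_sales_df`.)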

In [11]:
#base_df = store_sales_df.join(bank_df, ['issuer_name'])\
#  .select('clean_amount.*', 'bank.issuer_id')

In [12]:
#base_df.show(truncate=True, n=5)

### Build the Features Dataframe

In [13]:
# Build the feature table for the mult_features set, grouped by issuer name.
# (Alternative input once the bank join is enabled: base_df instead of
# store_sales_df.)
feature_df = ff.append_features(
    store_sales_df,
    groupBy_cols=["issuer_name"],
    feature_sets=[mult_features],
)

In [14]:
# Preview the feature table that was grouped by issuer_name.
feature_df.show()

+--------------------+--------------------+--------------+
|         issuer_name|           net_sales|total_quantity|
+--------------------+--------------------+--------------+
|BMO Bank of Montreal|  8.85825478646001E9|     286346816|
|          Royal Bank|1.919939227528001...|     636845558|
|          Scotiabank|1.126012862541997...|     385629905|
|     TD Canada Trust|2.102817429950998...|     734890180|
|                CIBC|1.428790330563999E10|     483494762|
+--------------------+--------------------+--------------+



### Write to Disk and Read the Aggregated Data

In [15]:
#feature_df.write.format("parquet").mode("overwrite").save("./temp/base_feats_df_out")

In [16]:
#from pyspark.sql import SparkSession
#spark = SparkSession.builder.appName('Test').getOrCreate()
#read_df = spark.read.format("parquet").load("./temp/base_feats_df_out")
#read_df.show(truncate=True, n=5)

### Using Multipliers

In [17]:
# Inspect the 'time_helpers' configuration held by the store
# (snapshot date, date column, and date-filter ranges, among others).
store.config.get_config('time_helpers').configs

{'snapshot_type': 'DAILY',
 'snapshot_date': '2018-01-01',
 'partition_col': None,
 'date_col': 'header_capture_month',
 'date_col_format': '%Y-%m-%d',
 'partition_col_format': '%Y%m',
 'date_filters': {'ranges': {'1m': {'start': '2017-12-01',
    'end': '2018-01-01'},
   '3m': {'start': '2017-10-01', 'end': '2018-01-01'}}},
 'partition_lower': '201710',
 'partition_upper': '201801'}

In [18]:
# Fetch the date-range multipliers from the store.
# NOTE(review): presumably derived from the 'date_filters' ranges (1m, 3m)
# in the time_helpers config - confirm against Store.
time_multipliers = store.get_daterange_multiplier()

In [19]:
# Combine the sales features with the time multipliers. "STORE" shows up as
# the prefix of the resulting column names (e.g. STORE_1M_NET_SALES).
mult_by_time_features = mult_features.multiply(time_multipliers, "STORE")

In [22]:
# Rebuild the feature table grouped by chip indicator, combining the plain
# sales features with their time-windowed variants (STORE_1M_*, STORE_3M_*).
# The group-by column is qualified with the 'clean_amount' alias that was set
# on store_sales_df.
# NOTE: this rebinds feature_df, replacing the table grouped by issuer_name.
feature_df = ff.append_features(
    store_sales_df,
    groupBy_cols=["clean_amount.chip_indicator_uid"],
    feature_sets=[mult_features, mult_by_time_features],
)


In [23]:
# Preview the feature table grouped by chip indicator, including the
# time-windowed STORE_* columns.
feature_df.show()

+------------------+--------------------+--------------+--------------------+--------------------+-----------------------+-----------------------+
|chip_indicator_uid|           net_sales|total_quantity|  STORE_1M_NET_SALES|  STORE_3M_NET_SALES|STORE_1M_TOTAL_QUANTITY|STORE_3M_TOTAL_QUANTITY|
+------------------+--------------------+--------------+--------------------+--------------------+-----------------------+-----------------------+
|                 6|               52.41|             1|                 0.0|               52.41|                      0|                      1|
|                 3| 5.409171115000006E7|       1696271|  390674.47000000003|  1088949.6099999999|                   6348|                  20523|
|                 5|      1.4351703091E8|       3915814|          4527384.78|       1.252198736E7|                 102887|                 311881|
|                 9|1.683412248285999...|    1000494061| 5.750419964200001E8|1.5866615026900005E9|               31548

In [None]:
# Persist the feature table as parquet, replacing any previous output at
# this location.
(
    feature_df.write
    .format("parquet")
    .mode("overwrite")
    .save("/tmp/tomes/ff/demo/mults1_feats_df_out")
)


In [None]:
# Read the persisted parquet output back and render it.
# NOTE(review): neither `spark` nor `display` is defined in the cells visible
# here; presumably both are provided by the notebook runtime - confirm.
display(
    spark.read
    .format("parquet")
    .load("/tmp/tomes/ff/demo/mults1_feats_df_out")
)