In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
11,application_1616109604682_0003,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fe922293fd0>

### Importing necessary libraries 

In [2]:
import hsfs
import datetime
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Open a connection to the feature store service and take a handle on the
# default feature store. A shared feature store can be selected instead by
# passing its name to `get_feature_store`.
connection = hsfs.connection()
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [3]:
# Column layout of the economy feature group; every column is nullable.
# NOTE(review): FloatType is single precision. The sample value 354724.18 is
# displayed as 354724.2 once loaded with this schema, so DoubleType may be a
# better fit for the monetary columns.
economy_fg_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("salary", FloatType(), True)
    .add("commission", FloatType(), True)
    .add("car", StringType(), True)
    .add("hvalue", FloatType(), True)
    .add("hyears", IntegerType(), True)
    .add("loan", FloatType(), True)
    .add("year", IntegerType(), True)
)

# Column layout of the demographic feature group; every column is nullable.
demographic_fg_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("age", IntegerType(), True)
    .add("elevel", StringType(), True)
    .add("zipcode", StringType(), True)
)

# Column layout of the class (label) feature group; every column is nullable.
class_fg_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("class", StringType(), True)
    .add("year", IntegerType(), True)
)

### Create a Spark DataFrame for each feature group

In [4]:
# Sample economy records; values are positional and follow the column order
# of economy_fg_schema (id, salary, commission, car, hvalue, hyears, loan, year).
economy_bulk_insert_data = [
    Row(*record)
    for record in (
        (1, 110499.73, 0.0, "car15", 235000.0, 30, 354724.18, 2020),
        (2, 140893.77, 0.0, "car20", 135000.0, 2, 395015.33, 2020),
        (3, 119159.65, 0.0, "car1", 145000.0, 22, 122025.08, 2020),
        (4, 20000.0, 52593.63, "car9", 185000.0, 30, 99629.62, 2020),
    )
]

economy_bulk_insert_df = spark.createDataFrame(
    economy_bulk_insert_data,
    schema=economy_fg_schema,
)

In [5]:
economy_bulk_insert_df.show()

+---+---------+----------+-----+--------+------+---------+----+
| id|   salary|commission|  car|  hvalue|hyears|     loan|year|
+---+---------+----------+-----+--------+------+---------+----+
|  1|110499.73|       0.0|car15|235000.0|    30| 354724.2|2020|
|  2|140893.77|       0.0|car20|135000.0|     2|395015.34|2020|
|  3|119159.65|       0.0| car1|145000.0|    22|122025.08|2020|
|  4|  20000.0|  52593.63| car9|185000.0|    30| 99629.62|2020|
+---+---------+----------+-----+--------+------+---------+----+

In [6]:
# Sample demographic records; values are positional and follow the column
# order of demographic_fg_schema (id, age, elevel, zipcode).
demographic_bulk_insert_data = [
    Row(*record)
    for record in (
        (1, 54, "level3", "zipcode5"),
        (2, 44, "level4", "zipcode8"),
        (3, 49, "level2", "zipcode4"),
        (4, 56, "level0", "zipcode2"),
    )
]

demographic_bulk_insert_df = spark.createDataFrame(
    demographic_bulk_insert_data,
    schema=demographic_fg_schema,
)

In [7]:
demographic_bulk_insert_df.show()

+---+---+------+--------+
| id|age|elevel| zipcode|
+---+---+------+--------+
|  1| 54|level3|zipcode5|
|  2| 44|level4|zipcode8|
|  3| 49|level2|zipcode4|
|  4| 56|level0|zipcode2|
+---+---+------+--------+

In [8]:
# Sample label records: ids 1 through 4 all carry the same class and year,
# laid out in the column order of class_fg_schema (id, class, year).
class_bulk_insert_data = [
    Row(customer_id, "groupB", 2020) for customer_id in range(1, 5)
]

class_bulk_insert_df = spark.createDataFrame(
    class_bulk_insert_data,
    schema=class_fg_schema,
)

In [None]:
# Design sketch: one transformation function per feature, each tagged with the
# feature group name, feature group version and feature name it applies to.
# NOTE(review): `hsfs_transformer` and `get_function` are neither defined nor
# imported in the visible cells, so this cell cannot run as written — confirm
# where they are meant to come from.
@hsfs_transformer(fg_name="fg1", fg_v = 1, feature_name="ft1")
def f1(x):
    "your logic here"
    return int(x)

@hsfs_transformer(fg_name="fg2",fg_v = 2, feature_name="ft2")
def f2(x):
    "your logic here"
    return str(x)

# NOTE(review): unlike f1/f2, the feature group name is passed positionally here.
@hsfs_transformer("fg3", fg_v = 3, feature_name="ft3")
def f3(x):
    "your logic here"
    return float(x)

# Presumably looks up an already registered function by name and version —
# TODO confirm against the eventual `get_function` API.
f4 = get_function("f4",1)



In [None]:
# Design sketch of chaining the per-feature functions.
# NOTE(review): each call below passes two arguments, whereas f1/f2/f3 as
# sketched in this notebook accept a single `x`; nothing is returned either.
def pipe(fg_name1,feature_name1, fg_name2,feature_name2, fg_name3,feature_name3):
    f1(fg_name1,feature_name1)
    f2(fg_name2,feature_name2)
    f3(fg_name3,feature_name3)

# NOTE(review): `pipe` declares six positional parameters but is called with a
# single list, and ft1/ft2/ft3 are not defined in the visible cells.
pipe([ft1,ft2,ft3,f4])

In [None]:
def some():
    # NOTE(review): placeholder sketch — `select` and `x` are not defined in the
    # visible cells, and the results of the three conversions are discarded.
    select() 
    int(x)
    str(x)
    float(x)
    

In [None]:
0, 0.1 0. 1

0 0.1
0.01
0.02

In [None]:
class Transformer(object):
    """Sketch of a serving-side transformer with pre- and post-processing hooks.

    NOTE(review): the `hsfs` calls used in `preprocess` (`get_serving_vector`,
    `get_training_dataset(...).get_trans_func()`, `.td_query`) are design
    placeholders; confirm them against the real hsfs API before relying on them.
    """

    def __init__(self):
        print("[Transformer] Initializing...")

    def preprocess(self, inputs):
        """Look up the feature vector for `inputs` and apply the transformation functions.

        Args:
            inputs: The incoming request payload.

        Returns:
            Whatever the transformation function attached to the training
            dataset returns for the looked-up feature vector.

        Raises:
            KeyError: If the MODEL_NAME environment variable is not set.
        """
        import os  # local import: the notebook does not import os at the top

        print("[Transformer] Preprocess:")
        print(inputs)
        # The original line here was the pseudo-code `inputs -> primary dict_keys`,
        # which is not valid Python.
        # NOTE(review): assumes `inputs` already is the primary-key mapping — TODO confirm.
        dict_keys = inputs
        feature_vector = hsfs.get_serving_vector(dict_keys)
        # `os.env` does not exist; environment variables live in `os.environ`.
        # NOTE(review): assumes the variable is literally named MODEL_NAME — TODO confirm.
        model_name = os.environ["MODEL_NAME"]
        trans_functions = hsfs.get_training_dataset(model_name).get_trans_func()
        schema = hsfs.get_training_dataset().td_query
        # Must exist before the loop assigns into it (it was previously undefined).
        results = {}
        for trans_f in trans_functions:
            f_id = schema(trans_f.feature_name)
            results[f_id] = trans_f(feature_vector[f_id])
        # NOTE(review): this second variant overwrites the per-feature results
        # computed above; only one of the two approaches is presumably meant to stay.
        pipe_trans_function = hsfs.get_training_dataset(model_name).get_trans_func()
        results = pipe_trans_function(feature_vector)
        return results

    def postprocess(self, outputs):
        """Log the model outputs and hand them back unchanged."""
        print("[Transformer] Postprocess and return...")
        print(outputs)
        return outputs

In [9]:
class_bulk_insert_df.show()

+---+------+----+
| id| class|year|
+---+------+----+
|  1|groupB|2020|
|  2|groupB|2020|
|  3|groupB|2020|
|  4|groupB|2020|
+---+------+----+

In [10]:
# Create the training dataset metadata object with the sketched transformer
# functions attached.
# NOTE(review): `transformer_fn` is the argument being prototyped in this
# notebook, and f1..f4 come from the design-sketch cell — confirm both exist
# in the hsfs build under test.
economy_td = fs.create_training_dataset(
    name="economy_td",
    description="Hudi Household Economy Feature Group",
    version=1,
    data_format="csv",  # the comma after this argument was missing (SyntaxError)
    transformer_fn=[f1, f2, f3, f4],
)

In [11]:
economy_td.save(class_bulk_insert_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fe8bc73c390>

In [None]:
economy_td.attach_transformer_fn([f1,f2,f2])

In [12]:
economy_td.detach_transformer_fn([f1,f2,f2])

In [13]:
economy_td

<hsfs.training_dataset.TrainingDataset object at 0x7fe8bc73c390>

In [None]:
economy_td.attach_transformer_fn(identity, type="python", version=1)

In [12]:
economy_td.get_transformer_fn()

In [40]:
from hsfs import transformer_function
from hsfs import client
# Metadata object describing the transformer to register.
# NOTE(review): `identity` is not defined in the visible cells.
transformer_instance = transformer_function.TransformerFunction(
    name=identity.__name__,
    version=1,
    transformer_location="",
    transformer_type="python",
)

_client = client.get_instance()

# Path segments of the REST resource for the training dataset's transformer functions.
path_params = [
    "project",
    _client._project_id,
    "featurestores",
    67,  # NOTE(review): hard-coded feature store id
    "trainingdatasets",
    economy_td.id,
    "transformerFunctions",
]
headers = {"content-type": "application/json"}

# POST the serialized transformer through the client's private request helper.
_client._send_request(
    "POST",
    path_params,
    headers=headers,
    data=transformer_instance.json(),
)

An error was encountered:
Expecting value: line 1 column 1 (char 0)
Traceback (most recent call last):
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/decorators.py", line 35, in if_connected
    return fn(inst, *args, **kwargs)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/client/base.py", line 155, in _send_request
    return response.json()
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/requests/models.py", line 889, in json
    self.content.decode(encoding), **kwargs
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSO

'{"name": null, "version": null, "transformerLocation": null, "transformerType": null}'

In [13]:
economy_td = fs.get_training_dataset("economy_td",1)

In [14]:
economy_td.attach_transformation_function(identity)

In [15]:
economy_td.transformation_function(19)

19

In [15]:
from hsfs.core import training_dataset_engine
_training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(67)

In [16]:
_transformation_function = _training_dataset_engine.get_transformation_function(economy_td)

An error was encountered:
[Errno 2] No such file or directory
Traceback (most recent call last):
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/hsfs/core/training_dataset_engine.py", line 210, in get_transformation_function
    hdfs.load(transformer_fn_instance.transformer_location)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/pydoop/hdfs/__init__.py", line 143, in load
    with open(hdfs_path, **kwargs) as fi:
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/pydoop/hdfs/__init__.py", line 113, in open
    encoding, errors)
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/pydoop/hdfs/fs.py", line 280, in open_file
    f = self.fs.open_file(path, m, buff_size, replication, blocksize)
FileNotFoundError: [Errno 2] No such file or directory



In [18]:
_transformation_function(19)

19

In [18]:
from hsfs.core import training_dataset_api
_training_dataset_api = training_dataset_api.TrainingDatasetApi(
            67
        )
_training_dataset_api.get_transformation_function(economy_td)

[<hsfs.transformer_function.TransformerFunction object at 0x7fe8bbafcf50>]

In [19]:
from contextlib import ContextDecorator
from time import time

In [99]:
class HsfsTransormation(ContextDecorator):
    """Timing context manager that can also be applied as a decorator.

    On entry it prints `description` and records the start time; on exit it
    records the end time and prints the elapsed seconds.

    Args:
        description: Text printed when the timed section starts.
    """

    def __init__(self, description):
        self.description = description

    def __enter__(self):
        print(self.description)
        self.start_time = time()
        # Return the instance so `with HsfsTransormation(...) as t:` binds the
        # timer instead of None.
        return self

    def __exit__(self, *args):
        self.end_time = time()
        run_time = self.end_time - self.start_time
        print(f"The function took {run_time} seconds to run.")
        # Explicitly do not suppress exceptions raised inside the timed section.
        return False

In [100]:
class PysparkTransormation(HsfsTransormation):
    """HsfsTransormation preset whose description is fixed to "pyspark"."""

    def __init__(self):
        super().__init__("pyspark")

In [102]:
# Each call runs inside a PysparkTransormation context.
@PysparkTransormation()
def identity_tnx(x):
    """Return `x` unchanged."""
    return x

In [103]:
identity_tnx(19)

pyspark
The function took 5.245208740234375e-06 seconds to run.
19

In [88]:
def hsfs_transormation(func):
    """Decorator that prints a message before and after calling `func`.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to `func`
        and returns its result. The wrapper keeps `func`'s name and docstring.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("create meta data object here")
        # Forward the arguments and keep the result; the earlier version
        # accepted no arguments and discarded the return value.
        result = func(*args, **kwargs)
        print("Something is happening after the function is called.")
        return result
    return wrapper

@hsfs_transormation
def say_whee():
    """Print a greeting; used to demonstrate the decorator."""
    print("Whee!")

say_whee()

create meta data object here
Whee!
Something is happening after the function is called.

In [104]:
from abc import ABC, abstractmethod
 
class AbstractClassExample(ABC):
    """Abstract base that stores one value and leaves `do_something` to subclasses."""

    def __init__(self, value):
        super().__init__()
        self.value = value

    @abstractmethod
    def do_something(self):
        """Operate on `self.value`; concrete subclasses must override this."""

In [30]:
class DoAdd42(AbstractClassExample):
    """Concrete example whose operation adds 42 to the stored value."""

    def do_something(self):
        """Return `self.value` plus 42."""
        return self.value + 42
    
class DoMul42(AbstractClassExample):
    """Concrete example whose operation multiplies the stored value by 42."""

    def do_something(self):
        """Return `self.value` times 42."""
        return self.value * 42
    
# Build one instance of each concrete subclass from the same input and print
# what each one computes, one result per line.
x, y = DoAdd42(10), DoMul42(10)

print(x.do_something(), y.do_something(), sep="\n")

52
420

In [None]:
from abc import ABC, abstractmethod
 
# NOTE(review): this redefines the AbstractClassExample declared in an earlier
# cell (the one taking `value`); once this cell runs, the earlier definition is
# shadowed for any code executed afterwards.
class AbstractClassExample(ABC):
    """Abstract base whose abstract method still ships a default implementation."""

    @abstractmethod
    def do_something(self):
        """Print a default message; subclasses must override this but can still call it through `super()`."""
        print("Some implementation!")
        
class AnotherSubclass(AbstractClassExample):
    """Concrete subclass that extends the base implementation instead of replacing it."""

    def do_something(self):
        """Run the base class implementation, then print an additional line."""
        super().do_something()
        print("The enrichment from AnotherSubclass")
        
# Instantiate the subclass and call the method: the base message is printed
# first, followed by the subclass's own line.
x = AnotherSubclass()
x.do_something()