In [1]:
import pyspark
import petastorm
from os.path import expanduser, join, abspath
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
from pyspark.sql.functions import udf, struct
from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from pyspark.sql.types import StructType, ArrayType, IntegerType, LongType, StringType, DoubleType, MapType
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
import numpy as np 
from petastorm.pytorch import DataLoader

# PARAMS
# Reader / writer tuning knobs used by the cells below.
num_epochs = 1
batch_size = 64
row_group_size_mb = 64
num_partitions = 10
# numpy dtype used for encoded action indices and action-mask entries
ONE_HOT_TYPE = np.int64

# where to put the petastorm parquet files
# Resolved against the current working directory (like the spark-warehouse
# location below) instead of hardcoding one user's home directory.
# NOTE(review): assumes the notebook is started from the project root -- confirm.
output_url = f"file://{abspath('my_petastorm_output')}"

# TODO: The below two should be stored in a config.
# list of all the actions (for one-hot encoding)
actions = ['0', '1']

# list of all the features and their possible values
feature_map = {
    'state_features': [0, 1, 2, 3],
    'next_state_features': [0, 1, 2, 3],
    'metrics': ['reward'],
}

# Local single-threaded Spark session with Hive support; the SQL warehouse
# directory is ./spark-warehouse resolved against the working directory.
warehouse_location = abspath("spark-warehouse")
spark = (
    pyspark.sql.SparkSession.builder.master("local[1]")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)



In [2]:
def pyspark_type_to_petastorm_type(t):
    """Convert a scalar PySpark SQL type instance to its numpy equivalent.

    Args:
        t: an instance of a pyspark.sql.types data type.

    Returns:
        np.float64 for DoubleType, np.int32 for IntegerType and np.int64 for
        LongType.

    Raises:
        NotImplementedError: if `t` is any other type.
    """
    if isinstance(t, DoubleType):
        return np.float64
    elif isinstance(t, IntegerType):
        return np.int32
    elif isinstance(t, LongType):
        return np.int64
    else:
        # Name the offending type so a failure deep in schema construction
        # is diagnosable.
        raise NotImplementedError(
            f"PySpark type {t} does not have numpy equivalent."
        )

def get_petastorm_schema(actions, feature_map, sparse_pyspark_schema):
    """
    Derive the dense Petastorm schema from the sparse PySpark schema.

    Field handling, by column:
    - "action" / "next_action": scalar ONE_HOT_TYPE field stored as a Long.
    - "possible_actions" / "possible_next_actions": ONE_HOT_TYPE array of
      length len(actions).
    - Map-typed columns: two arrays of length len(feature_map[column]) -- a
      "<column>_presence" int64 array and a dense array holding the map's
      (scalar) value type.
    - anything else: kept as a scalar field; Array-typed columns are rejected.

    No field in the resulting schema is nullable.
    """
    collected = []

    def add_field(name, petastorm_type, shape, codec):
        # trailing False: nothing can be null
        collected.append(UnischemaField(name, petastorm_type, shape, codec, False))

    for column in sparse_pyspark_schema:
        col_name = column.name

        # an action is stored as a single index
        if col_name in ("action", "next_action"):
            add_field(col_name, ONE_HOT_TYPE, (), ScalarCodec(LongType()))
            continue

        # the set of possible actions is stored as a fixed-length mask
        if col_name in ("possible_actions", "possible_next_actions"):
            add_field(col_name, ONE_HOT_TYPE, (len(actions),), NdarrayCodec())
            continue

        col_type = column.dataType

        # sparse2dense: one presence array plus one dense value array
        if isinstance(col_type, MapType):
            value_type = col_type.valueType
            assert not isinstance(
                value_type, MapType
            ), f"{col_name} has Map type with value type of Map"
            add_field(
                f"{col_name}_presence",
                np.int64,
                (len(feature_map[col_name]),),
                NdarrayCodec(),
            )
            # mapped values are assumed to be scalars
            add_field(
                col_name,
                pyspark_type_to_petastorm_type(value_type),
                (len(feature_map[col_name]),),
                NdarrayCodec(),
            )
            continue

        # plain scalar column: carried over unchanged
        assert not isinstance(col_type, ArrayType), f"{col_name} has array type"
        add_field(
            col_name,
            pyspark_type_to_petastorm_type(col_type),
            (),
            ScalarCodec(col_type),
        )

    return Unischema("TimelineSchema", collected)

def preprocessing(actions, feature_map, schema):
    """
    Build the per-row mapping function used to convert sparse rows into
    Petastorm rows.

    The returned function does two things:
        1) handles action/action_mask: "action" and "next_action" become their
           index in `actions` (len(actions) when not found), while
           "possible_actions" and "possible_next_actions" become 0/1 masks of
           length len(actions);
        2) performs sparse2dense: every dict-valued column becomes a dense
           array ordered by feature_map[column], plus a "<column>_presence"
           array marking which entries were present.

    Args:
        actions: list of all possible actions.
        feature_map: maps each dict-valued column name to the ordered list of
            keys it may contain.
        schema: Petastorm schema of the output row; its fields supply the
            numpy dtypes of the generated arrays.

    Returns:
        A function taking a row (an object with asDict()) and returning
        dict_to_spark_row(schema, converted_row).
    """
    def get_schema_type(name):
        return getattr(schema, name).numpy_dtype

    def find_action(desired_action):
        # index of the action, or len(actions) when it is not a known action
        for i, a in enumerate(actions):
            if a == desired_action:
                return i
        return len(actions)

    def row_map(row):
        row_dict = row.asDict()
        # first handle the action/masks
        action_keys = ["action", "next_action"]
        for k in action_keys:
            row_dict[k] = find_action(row_dict[k])

        possible_action_keys = ["possible_actions", "possible_next_actions"]
        for k in possible_action_keys:
            mask = np.zeros(len(actions), dtype=ONE_HOT_TYPE)
            for a in row_dict[k]:
                i = find_action(a)
                assert i < len(actions), f"action {a} (type {type(a)}) was not found in {actions}"
                mask[i] = 1
            row_dict[k] = mask

        # now handle rest of the keys (including sparse2dense)
        # The key set is snapshotted here, so the "<key>_presence" entries
        # added inside the loop are not themselves revisited.
        rest_keys = row_dict.keys() - set(action_keys + possible_action_keys)
        for k in rest_keys:
            val_type = get_schema_type(k)
            val = row_dict[k]
            # convert sparse to dense
            if isinstance(val, dict):
                presence_arr = []
                dense_arr = []
                for feature in feature_map[k]:
                    # absent
                    if feature not in val:
                        presence_arr.append(0)
                        dense_arr.append(0.0) # TODO: assuming value type is a number
                    # present
                    else:
                        presence_arr.append(1)
                        dense_arr.append(val[feature])
                presence_key = f"{k}_presence"
                row_dict[presence_key] = np.array(presence_arr, dtype=get_schema_type(presence_key))
                row_dict[k] = np.array(dense_arr, dtype=val_type)
            # scalar
            else:
                assert not isinstance(val, list)
                row_dict[k] = val
        return dict_to_spark_row(schema, row_dict)

    return row_map


In [None]:
# Query the cartpole_discrete_training table through Spark SQL and drop the
# "ds" and "mdp_id" columns.
df = spark.sql("SELECT * FROM cartpole_discrete_training").drop("ds", "mdp_id")

In [None]:
# Print a preview of the dataframe loaded in the previous cell.
df.show()

In [None]:
# Re-query the training table (same statement as the earlier cell) and build
# the dense Petastorm schema from its sparse PySpark schema.
df = spark.sql("SELECT * FROM cartpole_discrete_training").drop("ds", "mdp_id")
schema = get_petastorm_schema(actions, feature_map, df.schema)

# Convert every row with the preprocessing mapper and write the result as
# parquet to output_url, replacing any existing output.
# NOTE(review): materialize_dataset presumably adds the Petastorm metadata to
# the store when the block exits -- confirm against the Petastorm docs.
with materialize_dataset(spark, output_url, schema, row_group_size_mb):
    rdd = df.rdd.map(preprocessing(actions, feature_map, schema))
    out_df = spark.createDataFrame(rdd, schema.as_spark_schema()).coalesce(num_partitions)
    out_df.write.mode('overwrite').parquet(output_url)


In [3]:
from petastorm import make_reader, make_batch_reader

# Dataset location, built the same way save_petastorm_dataset builds its
# output URI (file:// + absolute path of the relative table directory) rather
# than hardcoding one user's home directory.
# NOTE(review): assumes the notebook runs from the project root -- confirm.
# This rebinds output_url, so cells run afterwards read from this location.
output_url = f"file://{abspath('training_data/cartpole_discrete_timeline')}"

# Open the dataset with a Petastorm reader and wrap it in the PyTorch
# DataLoader; for now only the loader's attributes are listed.
reader = make_reader(output_url, num_epochs=num_epochs)
with DataLoader(reader, batch_size=batch_size) as train_loader:
    print(dir(train_loader))
#     for idx, row in enumerate(train_loader):
#         print(type(row))
#         print(idx, row)
#         if idx == 0:
#             break




RuntimeError: Currently make_reader supports reading only Petastorm datasets. To read from a non-Petastorm Parquet store use make_batch_reader

In [None]:
# Load the dataset at output_url as an RDD through Petastorm's Spark helper
# and count its rows.
from petastorm.spark_utils import dataset_as_rdd
rdds = dataset_as_rdd(output_url, spark)
rdds.count()

In [None]:
""" Saves RLTimelineOperator's output as Petastorm parquet files.

Uses PySpark and Petastorm to convert RLTimelineOperator's output (in Hive) to
parquet files. A key piece of functionality is that each feature stored as a Map
(e.g. state_features), which Arrow can't yet handle, is converted to a dense
array and a presence array whose ith entry is 1 iff the ith feature_id is present.

Additional transformations include: 
- action, which was originally a string, will be represented by the index in
the given list of possible actions
- possible_actions, which was originally a list of strings, will be represented
as a bitvector mask of length len(actions).
"""
import pyspark
import petastorm
from os.path import expanduser, join, abspath
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
from pyspark.sql.functions import udf, struct
from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from pyspark.sql.types import (
    StructType,
    ArrayType,
    IntegerType,
    LongType,
    StringType,
    DoubleType,
    MapType,
)
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
import numpy as np
from petastorm.pytorch import DataLoader

from typing import List, Dict, Any
from ml.rl.workflow.helpers import parse_args
import logging
import sys

# numpy dtype used for the encoded action index fields (action, next_action)
# and for the entries of the possible-action masks.
ONE_HOT_TYPE = np.int64


def pyspark_to_numpy_types(pyspark_type):
    """
    Look up the numpy type matching an instance of a pyspark.sql.types type.

    Supported: DoubleType -> np.float64, IntegerType -> np.int32,
    LongType -> np.int64. Anything else raises NotImplementedError.
    """
    # Checked in order; the first isinstance match wins.
    conversions = (
        (DoubleType, np.float64),
        (IntegerType, np.int32),
        (LongType, np.int64),
    )
    for spark_cls, numpy_type in conversions:
        if isinstance(pyspark_type, spark_cls):
            return numpy_type
    raise NotImplementedError(
        f"PySpark type {pyspark_type} does not have numpy equivalent."
    )


def get_petastorm_schema(
    actions: List[str], feature_map: Dict[str, List[Any]], sparse_pyspark_schema
) -> Unischema:
    """
    Build the Petastorm storage schema that corresponds to the sparse schema
    of the RLTimelineOperator's output.

    Args:
        actions: List of possible actions; its length fixes the size of the
            possible-action masks.
        feature_map: For every Map-typed column (e.g. state_features), the
            list of feature_ids that may occur in it; its length fixes the
            size of that column's dense and presence arrays.
        sparse_pyspark_schema: Schema of RLTimelineOperator's output as loaded
            directly by PySpark.

    Returns:
        Petastorm storage schema named "TimelineSchema"; none of its fields
        is nullable.
    """
    fields = []

    def add_field(name, petastorm_type, shape, codec):
        # trailing False: nothing can be null
        fields.append(UnischemaField(name, petastorm_type, shape, codec, False))

    scalar_action_columns = ("action", "next_action")
    action_mask_columns = ("possible_actions", "possible_next_actions")

    for column in sparse_pyspark_schema:
        column_name = column.name

        # actions become a single index, action sets become a mask
        if column_name in scalar_action_columns:
            add_field(column_name, ONE_HOT_TYPE, (), ScalarCodec(LongType()))
            continue
        if column_name in action_mask_columns:
            add_field(column_name, ONE_HOT_TYPE, (len(actions),), NdarrayCodec())
            continue

        column_type = column.dataType

        # sparse2dense: a Map column yields a presence array and a dense array
        if isinstance(column_type, MapType):
            value_type = column_type.valueType
            assert not isinstance(
                value_type, MapType
            ), f"{column_name} has Map type with value type of Map"
            add_field(
                f"{column_name}_presence",
                np.int64,
                (len(feature_map[column_name]),),
                NdarrayCodec(),
            )
            # mapped values are assumed to be scalars
            add_field(
                column_name,
                pyspark_to_numpy_types(value_type),
                (len(feature_map[column_name]),),
                NdarrayCodec(),
            )
            continue

        # everything else must be a scalar and is stored as-is
        assert not isinstance(
            column_type, ArrayType
        ), f"{column_name} has array type"
        add_field(
            column_name,
            pyspark_to_numpy_types(column_type),
            (),
            ScalarCodec(column_type),
        )

    return Unischema("TimelineSchema", fields)


def preprocessing(
    actions: List[str], feature_map: Dict[str, List[Any]], petastorm_schema
):
    """
    Create the function to pass to RDD.map that rewrites one row of the
    RLTimelineOperator's output into the petastorm format: actions replaced
    by indices/masks, sparse (dict) features replaced by dense arrays, and a
    presence array added per sparse feature.

    Args:
        actions: same as get_petastorm_schema
        feature_map: same as get_petastorm_schema
        petastorm_schema: Desired schema of the petastorm dataframe.
    """
    action_keys = ["action", "next_action"]
    possible_action_keys = ["possible_actions", "possible_next_actions"]

    def get_schema_type(feature_name):
        """ Numpy dtype the schema declares for the given field. """
        assert hasattr(
            petastorm_schema, feature_name
        ), f"{feature_name} does not exist."
        return getattr(petastorm_schema, feature_name).numpy_dtype

    def find_action_idx(desired_action):
        """ Position of the action within `actions`, or len(actions) when the
        action is not one of them.
        """
        matches = (i for i, a in enumerate(actions) if a == desired_action)
        return next(matches, len(actions))

    def row_map(row):
        """ The RDD mapping function """
        converted = row.asDict()

        # single actions -> index into `actions`
        for key in action_keys:
            converted[key] = find_action_idx(converted[key])

        # lists of possible actions -> 0/1 mask over `actions`
        for key in possible_action_keys:
            mask = np.zeros(len(actions), dtype=ONE_HOT_TYPE)
            for candidate in converted[key]:
                idx = find_action_idx(candidate)
                assert idx < len(actions), f"action {candidate} (type {type(candidate)}) was not found in {actions}"
                mask[idx] = 1
            converted[key] = mask

        # every remaining column: densify dicts, leave scalars untouched
        already_handled = set(action_keys + possible_action_keys)
        for feature in converted.keys() - already_handled:
            val_type = get_schema_type(feature)
            val = converted[feature]

            if not isinstance(val, dict):
                # not a sparse map: the value stays in place unchanged
                assert not isinstance(val, list)
                continue

            # Walk the possible feature_ids in order. A feature_id found in
            # the sparse map is marked present and its value copied; a
            # missing one is marked absent and filled with 0.0 (values are
            # assumed to be numbers).
            feature_ids = feature_map[feature]
            presence_arr = [1 if fid in val else 0 for fid in feature_ids]
            dense_arr = [val[fid] if fid in val else 0.0 for fid in feature_ids]

            presence_key = f"{feature}_presence"
            presence_type = get_schema_type(presence_key)
            converted[presence_key] = np.array(presence_arr, dtype=presence_type)
            converted[feature] = np.array(dense_arr, dtype=val_type)

        return dict_to_spark_row(petastorm_schema, converted)

    return row_map


def save_petastorm_dataset(
    actions, feature_map, input_table, output_table, warehouse_dir="spark-warehouse", row_group_size_mb=256
):
    """ Assuming RLTimelineOperator output is in Hive storage at warehouse_dir,
        save the petastorm dataset after preprocessing.

        Args:
            actions: same as get_petastorm_schema
            feature_map: same as get_petastorm_schema
            input_table: table to read output of RLTimelineOperator
            output_table: location to store the dataset; resolved to an
                absolute file:// URI
            warehouse_dir: location where RLTimelineOperator stored data;
                resolved to an absolute path and used as the Spark SQL
                warehouse directory
            row_group_size_mb: parquet row group size for dataset

        Returns:
            None. The dataset is written to output_table, overwriting any
            existing output there.
    """
    warehouse_dir = abspath(warehouse_dir)
    output_uri = f"file://{abspath(output_table)}"
    spark = (
        pyspark.sql.SparkSession.builder.master("local[1]")
        # Pass warehouse_dir on to Spark; it used to be computed and then
        # ignored, so the parameter had no effect.
        # NOTE(review): if a session already exists, getOrCreate() presumably
        # returns it and this setting may not take effect -- confirm.
        .config("spark.sql.warehouse.dir", warehouse_dir)
        .enableHiveSupport()
        .getOrCreate()
    )

    # Read dataframe, and drop the mdp_id column
    df = spark.read.table(input_table).drop("mdp_id")
    peta_schema = get_petastorm_schema(actions, feature_map, df.schema)
    # The write happens inside materialize_dataset so it is tied to
    # peta_schema and row_group_size_mb.
    with materialize_dataset(spark, output_uri, peta_schema, row_group_size_mb):
        rdd = df.rdd.map(preprocessing(actions, feature_map, peta_schema))
        peta_df = spark.createDataFrame(rdd, peta_schema.as_spark_schema())
        peta_df.write.mode("overwrite").parquet(output_uri)


# `logger` was used below without ever being created, which raised NameError
# right after the dataset had been written.
# INFO records are only shown once logging is configured, e.g. with
# logging.basicConfig(level=logging.INFO).
logger = logging.getLogger(__name__)

train_input_table = "cartpole_discrete_training"
train_output_table = "training_data/cartpole_discrete_timeline"
# eval_input_table = "cartpole_discrete_eval"
# eval_output_table = "training_data/cartpole_discrete_timeline_eval"

# list of all the actions (for one-hot encoding)
# NOTE(review): get_petastorm_schema/preprocessing annotate actions as
# List[str] and compare them to the table's action values with ==; if those
# values are strings, these ints never match and the possible_actions
# assertion fails. Confirm whether this should be ["0", "1"].
actions = [0, 1]

# list of all the features and their possible values
feature_map = {
    "state_features": [0, 1, 2, 3],
    "next_state_features": [0, 1, 2, 3],
    "metrics": ["reward"],
}

save_petastorm_dataset(
    actions, feature_map, train_input_table, train_output_table
)
logger.info(f"Saved training table as {train_output_table}")

# save_petastorm_dataset(
#     actions, feature_map, eval_input_table, eval_output_table
# )
# logger.info(f"Saved eval table as {eval_output_table}")



In [None]:
# List the tables visible to the Spark session (at most 10 rows, with column
# truncation turned off).
spark.sql("SHOW TABLES").show(10, False)


In [None]:
# Load the whole training table by name; this rebinds `df`, and unlike the
# earlier query cells no columns are dropped here.
table = "cartpole_discrete_training"
# df = spark.sql(f"SELECT * FROM {table}")
df = spark.read.table(table)


In [None]:
# Print a preview of the table loaded in the previous cell.
df.show()

In [None]:
# Display the PySpark schema of the loaded table (the kind of schema that
# get_petastorm_schema takes as sparse_pyspark_schema).
df.schema

In [None]:
from os.path import expanduser, join, abspath
import pyspark

# Local single-threaded Spark session with Hive support, using
# ./spark-warehouse (resolved against the working directory) as the SQL
# warehouse; afterwards list up to 20 tables without truncating columns.
warehouse_location = abspath("spark-warehouse")
spark = (
    pyspark.sql.SparkSession.builder.master("local[1]")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)
spark.sql("SHOW TABLES").show(20, False)

In [None]:
# The JSON reads below are disabled; only the warehouse tables are loaded.
# train_df = spark.read.json("cartpole_discrete_training")
# eval_df = spark.read.json("cartpole_discrete_eval")
# Full contents of the training and eval tables, queried through Spark SQL.
warehouse_df = spark.sql("SELECT * FROM cartpole_discrete_training")
warehouse_eval_df = spark.sql("SELECT * FROM cartpole_discrete_eval")

In [None]:
# NOTE(review): `train_df` is not assigned by any visible cell -- its only
# definition is the commented-out spark.read.json(...) line in the cell above,
# so this presumably raises NameError on a fresh kernel.
train_df.count()

In [None]:
# NOTE(review): `eval_df` is not assigned by any visible cell -- its only
# definition is a commented-out spark.read.json(...) line in an earlier cell,
# so this presumably raises NameError on a fresh kernel.
eval_df.count()

In [None]:
# Number of rows in the cartpole_discrete_training table.
warehouse_df.count()

In [None]:
# Number of rows in the cartpole_discrete_eval table.
warehouse_eval_df.count()

In [None]:
# read as rdd
# Requests every field of `schema` (the Unischema built in the materialize
# cell) and shows the first decoded row.
from petastorm.spark_utils import dataset_as_rdd
rdd = dataset_as_rdd(output_url, spark, list(schema.fields.values()))
# first() fetches a single row; collect()[0] pulled the whole dataset onto
# the driver just to display one element.
rdd.first()